Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bgpt/conditional_launch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Launch distributed bGPT training via Slurm + torchrun (c10d rendezvous).
# Usage: conditional_launch.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because 'source activate' and
# 'module load' source third-party scripts that may reference unset vars.
set -eo pipefail

# Directory containing train-gen.py and the training config to use.
SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/configs/config_110M.yaml"
NUM_NODES=32
NUM_GPUS_PER_NODE=8

cd "$SRC_DIR" || exit 1

source activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt

module load rocm/5.2

# Rendezvous host: first IPv4 address on the hsn0 interface of this node.
MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
export MASTER_IP

# mkdir "$SLURM_JOB_NAME"
# mkdir "$SLURM_JOB_NAME"/checkpoints
# mkdir "$SLURM_JOB_NAME"/dataloaders

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi
28 changes: 28 additions & 0 deletions bgpt/conditional_launch_jon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Launch distributed bGPT training via Slurm + torchrun (c10d rendezvous).
# Usage: conditional_launch_jon.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because 'source activate' and
# 'module load' source third-party scripts that may reference unset vars.
set -eo pipefail

# Define the source and target directories
SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_110M_jon.yaml"
# TODO: Pass this through from launch_global_jon.sh
NUM_NODES=2
# Hoisted from the hard-coded '8' in the srun lines, for consistency with
# the other conditional_launch*.sh scripts. Same value, no behavior change.
NUM_GPUS_PER_NODE=8

cd "$SRC_DIR" || exit 1

source activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2

module load rocm/5.2

# Rendezvous host: first IPv4 address on the hsn0 interface of this node.
MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
export MASTER_IP

# mkdir "$SLURM_JOB_NAME"
# mkdir "$SLURM_JOB_NAME"/checkpoints
# mkdir "$SLURM_JOB_NAME"/dataloaders

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi


25 changes: 25 additions & 0 deletions bgpt/conditional_launch_nvidia.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Launch single-node bGPT training on the NVIDIA box via Slurm + torchrun.
# Usage: conditional_launch_nvidia.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because .bashrc and
# 'source activate' may reference unset vars.
set -eo pipefail

# Define the source and target directories
SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_math_nvidia.yaml"
# TODO: Pass this through from launch_global_jon.sh
NUM_NODES=1
NUM_GPUS_PER_NODE=4

cd "$SRC_DIR" || exit 1

# .bashrc may early-return non-zero in a non-interactive shell; a failure
# here is non-fatal, hence the deliberate '|| true'.
source "${HOME}/.bashrc" || true
source activate /nfs/scratch/jonathan/micromamba/envs/bgpt

# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# Single node: rendezvous stays on loopback.
export MASTER_IP=localhost

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi


25 changes: 25 additions & 0 deletions bgpt/conditional_launch_nvidia_small.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Launch a small (2-GPU) test run of bGPT training via Slurm + torchrun.
# Usage: conditional_launch_nvidia_small.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because .bashrc and
# 'source activate' may reference unset vars.
set -eo pipefail

# Define the source and target directories
SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_test.yaml"
# TODO: Pass this through from launch_global_jon.sh
NUM_NODES=1
NUM_GPUS_PER_NODE=2

cd "$SRC_DIR" || exit 1

# .bashrc may early-return non-zero in a non-interactive shell; a failure
# here is non-fatal, hence the deliberate '|| true'.
source "${HOME}/.bashrc" || true
source activate /nfs/scratch/jonathan/micromamba/envs/bgpt

# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# Single node: rendezvous stays on loopback.
export MASTER_IP=localhost

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi


59 changes: 59 additions & 0 deletions bgpt/config_og.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Flat constants module: configuration for generative modelling and classification.

Read by the training/inference scripts; edit values in place.
"""
# Folders containing training data (one folder per dataset; commented entries
# are alternative datasets kept for convenience).
TRAIN_FOLDERS = [
# "wikipedia/train",
# "ag_news/train",
# "imagenet32/train",
# "cifar/train",
# "librispeech8K/train",
# "speech_commands8K/train",
#"irishman/train",
#"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy",
"/home/mila/m/mina.beiramy/scratch/bgpt/data/mix",
# "cpu_states/train",
] # Folders containing training data
# Folders containing evaluation data (mirrors TRAIN_FOLDERS).
EVAL_FOLDERS = [
# "wikipedia/test",
# "ag_news/test",
# "imagenet32/test",
# "cifar/test",
# "librispeech8K/test",
# "speech_commands8K/test",
#"irishman/test",
#"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy_val",
"/home/mila/m/mina.beiramy/scratch/bgpt/data/mix_val",
# "cpu_states/test",
] # Folders containing evaluation data

# Configuration for the paths
PRETRAINED_PATH = "weights-text.pth" # Path to pre-trained weights
WEIGHTS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/exp/chkp/abcmidi-btch16-ptch16-v1.pth" # Path to save weights
LOGS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/logs/logs-abcmidi-btch16-ptch16-v1.txt" # Path to save logs

# Configuration for the model
PATCH_SIZE = 16 # Patch Size (bytes per patch)
PATCH_LENGTH = 512 # Patch Length (patches per sequence)
BYTE_NUM_LAYERS = 3 # Number of layers in the decoder
PATCH_NUM_LAYERS = 12 # Number of layers in the encoder
HIDDEN_SIZE = 768 # Hidden Size

# Configuration for the training
NUM_EPOCHS = 10 # Number of epochs to train for (if early stopping doesn't intervene)
LEARNING_RATE = 1e-5 # Learning rate for the optimizer
BATCH_SIZE = 16 # Batch size for training
ACCUMULATION_STEPS = 1 # Accumulation steps to simulate large batch size
PATCH_SAMPLING_BATCH_SIZE = 0 # Batch size for patch during training, 0 for full context
LOAD_FROM_CHECKPOINT = False # Whether to load weights from a checkpoint
LOAD_FROM_PRETRAINED = True # Whether to load pre-trained weights from a checkpoint
CONVERSION_MODE = 'abc&mid' # Mode of conversion (None for regular training, input->output for unidirectional conversion, input&output for bidirectional conversion)

# Configuration for inference
INFERENCE_WEIGHTS_PATH = "weights-conversion.pth" # Path to weights for inference
INPUT_EXT = "abc" # Extension of input files, used for conversion
TARGET_EXT = "mid" # Extension of target files
INPUT_FOLDER = "input" # Folder containing input files
OUTPUT_FOLDER = "output" # Folder to save output files
MODE = "convert" # Mode of inference (convert or generate)
NUM_SAMPLES = 100 # Number of samples to generate (only for generate mode)
TOP_K = 0 # Top k for sampling (0 disables top-k filtering)
TOP_P = 1. # Top p for sampling (1.0 disables nucleus filtering)
TEMPERATURE = 1 # Temperature for sampling
7 changes: 4 additions & 3 deletions bgpt/configs/config_110M.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample"
- "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/bgpt_110m_ezmath_2digit"
base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
Expand All @@ -17,10 +17,11 @@ learning_rate: 0.0001
batch_size: 16
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: false
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: true
wandb:
proj_name: byte_models
entity: jonathanlimsc
Expand Down
34 changes: 34 additions & 0 deletions bgpt/configs/config_110M_jon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample_10K"
# - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train"
# - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-easy"
# - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-medium"
# - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-hard"
# - "/lustre/orion/csc590/proj-shared/wikipedia/train_text3"
# - "/lustre/orion/csc590/proj-shared/imagenet-1k/train"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/test_continual"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 32
learning_rate: 0.0001
batch_size: 16
accumulation_steps: 1
patch_sampling_batch_size: 0
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 40
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
31 changes: 31 additions & 0 deletions bgpt/configs/config_110M_math_nvidia.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
train_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/train"
eval_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/validation"
base_dir: "/nfs/scratch/jonathan/byte_models/simple_math"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 10
learning_rate: 0.0001
batch_size: 8
accumulation_steps: 1
patch_sampling_batch_size: 0
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 10000
logging_frequency: 100
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: online # put 'offline' if you don't want it to upload to the WandB server
name: simple_math
31 changes: 31 additions & 0 deletions bgpt/configs/config_110M_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
train_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/sample_10K"
eval_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/val_1K"
base_dir: "/nfs/scratch/jonathan/byte_models/test_eval"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 10
learning_rate: 0.0001
batch_size: 8
accumulation_steps: 1
patch_sampling_batch_size: 0
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 1000
logging_frequency: 100
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: online # put 'offline' if you don't want it to upload to the WandB server
name: test_eval
28 changes: 28 additions & 0 deletions bgpt/configs/config_1B_jon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 16
hidden_size: 2048
num_epochs: 5
learning_rate: 0.0001
batch_size: 4
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: false
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
28 changes: 28 additions & 0 deletions bgpt/configs/config_400M_jon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 24
hidden_size: 1024
num_epochs: 5
learning_rate: 0.0001
batch_size: 8
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: false
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
28 changes: 28 additions & 0 deletions bgpt/configs/reload_config_110M.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
train_folders:
- "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 1024
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 32
learning_rate: 0.0001
batch_size: 16
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: true
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: true
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
Loading