Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bgpt/conditional_launch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Launch distributed bGPT training via Slurm + torchrun (c10d rendezvous).
# Usage: conditional_launch.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because 'source activate' and
# 'module load' source third-party scripts that may reference unset vars.
set -eo pipefail

# Directory containing train-gen.py and the training config to use.
SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/configs/config_110M.yaml"
NUM_NODES=32
NUM_GPUS_PER_NODE=8

cd "$SRC_DIR" || exit 1

source activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt

module load rocm/5.2

# Rendezvous host: first IPv4 address on the hsn0 interface of this node.
MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
export MASTER_IP

# mkdir "$SLURM_JOB_NAME"
# mkdir "$SLURM_JOB_NAME"/checkpoints
# mkdir "$SLURM_JOB_NAME"/dataloaders

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi
28 changes: 28 additions & 0 deletions bgpt/conditional_launch_jon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Launch distributed bGPT training via Slurm + torchrun (c10d rendezvous).
# Usage: conditional_launch_jon.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because 'source activate' and
# 'module load' source third-party scripts that may reference unset vars.
set -eo pipefail

# Define the source and target directories
SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_110M_jon.yaml"
# TODO: Pass this through from launch_global_jon.sh
NUM_NODES=2
# Hoisted from the hard-coded '8' in the srun lines, for consistency with
# the other conditional_launch*.sh scripts. Same value, no behavior change.
NUM_GPUS_PER_NODE=8

cd "$SRC_DIR" || exit 1

source activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2

module load rocm/5.2

# Rendezvous host: first IPv4 address on the hsn0 interface of this node.
MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
export MASTER_IP

# mkdir "$SLURM_JOB_NAME"
# mkdir "$SLURM_JOB_NAME"/checkpoints
# mkdir "$SLURM_JOB_NAME"/dataloaders

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi


25 changes: 25 additions & 0 deletions bgpt/conditional_launch_nvidia.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Launch single-node bGPT training on the NVIDIA box via Slurm + torchrun.
# Usage: conditional_launch_nvidia.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because .bashrc and
# 'source activate' may reference unset vars.
set -eo pipefail

# Define the source and target directories
SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_math_nvidia.yaml"
# TODO: Pass this through from launch_global_jon.sh
NUM_NODES=1
NUM_GPUS_PER_NODE=4

cd "$SRC_DIR" || exit 1

# .bashrc may early-return non-zero in a non-interactive shell; a failure
# here is non-fatal, hence the deliberate '|| true'.
source "${HOME}/.bashrc" || true
source activate /nfs/scratch/jonathan/micromamba/envs/bgpt

# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# Single node: rendezvous stays on loopback.
export MASTER_IP=localhost

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi


25 changes: 25 additions & 0 deletions bgpt/conditional_launch_nvidia_small.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Launch a small (2-GPU) test run of bGPT training via Slurm + torchrun.
# Usage: conditional_launch_nvidia_small.sh [--load-from-checkpoint]
#
# NOTE(review): -u is deliberately omitted because .bashrc and
# 'source activate' may reference unset vars.
set -eo pipefail

# Define the source and target directories
SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"

# Change to config YAML path
CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_test.yaml"
# TODO: Pass this through from launch_global_jon.sh
NUM_NODES=1
NUM_GPUS_PER_NODE=2

cd "$SRC_DIR" || exit 1

# .bashrc may early-return non-zero in a non-interactive shell; a failure
# here is non-fatal, hence the deliberate '|| true'.
source "${HOME}/.bashrc" || true
source activate /nfs/scratch/jonathan/micromamba/envs/bgpt

# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# Single node: rendezvous stays on loopback.
export MASTER_IP=localhost

# "${1:-}" guards against an unset $1 when no flag is passed.
if [ "${1:-}" = "--load-from-checkpoint" ]; then
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
else
srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="${SLURM_JOB_ID}" --rdzv_backend=c10d --rdzv_endpoint="${MASTER_IP}:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
fi


59 changes: 59 additions & 0 deletions bgpt/config_og.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Flat constants module: configuration for generative modelling and classification.

Read by the training/inference scripts; edit values in place.
"""
# Folders containing training data (one folder per dataset; commented entries
# are alternative datasets kept for convenience).
TRAIN_FOLDERS = [
# "wikipedia/train",
# "ag_news/train",
# "imagenet32/train",
# "cifar/train",
# "librispeech8K/train",
# "speech_commands8K/train",
#"irishman/train",
#"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy",
"/home/mila/m/mina.beiramy/scratch/bgpt/data/mix",
# "cpu_states/train",
] # Folders containing training data
# Folders containing evaluation data (mirrors TRAIN_FOLDERS).
EVAL_FOLDERS = [
# "wikipedia/test",
# "ag_news/test",
# "imagenet32/test",
# "cifar/test",
# "librispeech8K/test",
# "speech_commands8K/test",
#"irishman/test",
#"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy_val",
"/home/mila/m/mina.beiramy/scratch/bgpt/data/mix_val",
# "cpu_states/test",
] # Folders containing evaluation data

# Configuration for the paths
PRETRAINED_PATH = "weights-text.pth" # Path to pre-trained weights
WEIGHTS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/exp/chkp/abcmidi-btch16-ptch16-v1.pth" # Path to save weights
LOGS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/logs/logs-abcmidi-btch16-ptch16-v1.txt" # Path to save logs

# Configuration for the model
PATCH_SIZE = 16 # Patch Size (bytes per patch)
PATCH_LENGTH = 512 # Patch Length (patches per sequence)
BYTE_NUM_LAYERS = 3 # Number of layers in the decoder
PATCH_NUM_LAYERS = 12 # Number of layers in the encoder
HIDDEN_SIZE = 768 # Hidden Size

# Configuration for the training
NUM_EPOCHS = 10 # Number of epochs to train for (if early stopping doesn't intervene)
LEARNING_RATE = 1e-5 # Learning rate for the optimizer
BATCH_SIZE = 16 # Batch size for training
ACCUMULATION_STEPS = 1 # Accumulation steps to simulate large batch size
PATCH_SAMPLING_BATCH_SIZE = 0 # Batch size for patch during training, 0 for full context
LOAD_FROM_CHECKPOINT = False # Whether to load weights from a checkpoint
LOAD_FROM_PRETRAINED = True # Whether to load pre-trained weights from a checkpoint
CONVERSION_MODE = 'abc&mid' # Mode of conversion (None for regular training, input->output for unidirectional conversion, input&output for bidirectional conversion)

# Configuration for inference
INFERENCE_WEIGHTS_PATH = "weights-conversion.pth" # Path to weights for inference
INPUT_EXT = "abc" # Extension of input files, used for conversion
TARGET_EXT = "mid" # Extension of target files
INPUT_FOLDER = "input" # Folder containing input files
OUTPUT_FOLDER = "output" # Folder to save output files
MODE = "convert" # Mode of inference (convert or generate)
NUM_SAMPLES = 100 # Number of samples to generate (only for generate mode)
TOP_K = 0 # Top k for sampling (0 disables top-k filtering)
TOP_P = 1. # Top p for sampling (1.0 disables nucleus filtering)
TEMPERATURE = 1 # Temperature for sampling
7 changes: 4 additions & 3 deletions bgpt/configs/config_110M.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample"
- "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/bgpt_110m_ezmath_2digit"
base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
Expand All @@ -17,10 +17,11 @@ learning_rate: 0.0001
batch_size: 16
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: false
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: true
wandb:
proj_name: byte_models
entity: jonathanlimsc
Expand Down
34 changes: 34 additions & 0 deletions bgpt/configs/config_110M_jon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample_10K"
# - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train"
# - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-easy"
# - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-medium"
# - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-hard"
# - "/lustre/orion/csc590/proj-shared/wikipedia/train_text3"
# - "/lustre/orion/csc590/proj-shared/imagenet-1k/train"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/test_continual"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 32
learning_rate: 0.0001
batch_size: 16
accumulation_steps: 1
patch_sampling_batch_size: 0
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 40
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
31 changes: 31 additions & 0 deletions bgpt/configs/config_110M_math_nvidia.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
train_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/train"
eval_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/validation"
base_dir: "/nfs/scratch/jonathan/byte_models/simple_math"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 10
learning_rate: 0.0001
batch_size: 8
accumulation_steps: 1
patch_sampling_batch_size: 0
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 10000
logging_frequency: 100
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: online # put 'offline' if you don't want it to upload to the WandB server
name: simple_math
31 changes: 31 additions & 0 deletions bgpt/configs/config_110M_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
train_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/sample_10K"
eval_folders:
- "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/val_1K"
base_dir: "/nfs/scratch/jonathan/byte_models/test_eval"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 10
learning_rate: 0.0001
batch_size: 8
accumulation_steps: 1
patch_sampling_batch_size: 0
# load_from_checkpoint: false # Deprecated: now a command-line flag (False by default), so there is no need to maintain both a normal config (False) and a separate reload config (True).
load_from_pre_checkpoint: false
checkpoint_frequency: 1000
logging_frequency: 100
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: online # put 'offline' if you don't want it to upload to the WandB server
name: test_eval
28 changes: 28 additions & 0 deletions bgpt/configs/config_1B_jon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 16
hidden_size: 2048
num_epochs: 5
learning_rate: 0.0001
batch_size: 4
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: false
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
28 changes: 28 additions & 0 deletions bgpt/configs/config_400M_jon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
train_folders:
- "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 512
byte_num_layers: 3
patch_num_layers: 24
hidden_size: 1024
num_epochs: 5
learning_rate: 0.0001
batch_size: 8
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: false
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: false
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
28 changes: 28 additions & 0 deletions bgpt/configs/reload_config_110M.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
train_folders:
- "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3"
eval_folders: []
base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia"
weights_path: "weights.pth"
pre_weights_path: null
logs_path: "log.txt"
checkpoint_path: "checkpoints"
dataloader_path: "dataloaders"
patch_size: 16
patch_length: 1024
byte_num_layers: 3
patch_num_layers: 12
hidden_size: 768
num_epochs: 32
learning_rate: 0.0001
batch_size: 16
accumulation_steps: 1
patch_sampling_batch_size: 0
load_from_checkpoint: true
load_from_pre_checkpoint: false
checkpoint_frequency: 5
first_launch: true
verbose: true
wandb:
proj_name: byte_models
entity: jonathanlimsc
mode: offline
Loading