diff --git a/bgpt/conditional_launch.sh b/bgpt/conditional_launch.sh new file mode 100644 index 0000000..96bf630 --- /dev/null +++ b/bgpt/conditional_launch.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Define the source and target directories +SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/configs/config_110M.yaml" +NUM_NODES=32 +NUM_GPUS_PER_NODE=8 + +cd "$SRC_DIR" + +source activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt + +module load rocm/5.2 + +export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1` + +# mkdir "$SLURM_JOB_NAME" +# mkdir "$SLURM_JOB_NAME"/checkpoints +# mkdir "$SLURM_JOB_NAME"/dataloaders + +if [ "$1" = "--load-from-checkpoint" ]; then + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} --load-from-checkpoint +else + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} +fi \ No newline at end of file diff --git a/bgpt/conditional_launch_jon.sh b/bgpt/conditional_launch_jon.sh new file mode 100644 index 0000000..5651eb1 --- /dev/null +++ b/bgpt/conditional_launch_jon.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Define the source and target directories +SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_110M_jon.yaml" +# TODO: Pass this through from launch_global_jon.sh +NUM_NODES=2 + +cd "$SRC_DIR" + +source activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2 + +module load rocm/5.2 + +export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet 
([0-9.]+).*/\1/p' | head -1` + +# mkdir "$SLURM_JOB_NAME" +# mkdir "$SLURM_JOB_NAME"/checkpoints +# mkdir "$SLURM_JOB_NAME"/dataloaders + +if [ "$1" = "--load-from-checkpoint" ]; then + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} --load-from-checkpoint +else + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} +fi + + diff --git a/bgpt/conditional_launch_nvidia.sh b/bgpt/conditional_launch_nvidia.sh new file mode 100644 index 0000000..b90bc5e --- /dev/null +++ b/bgpt/conditional_launch_nvidia.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Define the source and target directories +SRC_DIR="/home/jonathan/cerc/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_math_nvidia.yaml" +# TODO: Pass this through from launch_global_jon.sh +NUM_NODES=1 +NUM_GPUS_PER_NODE=4 + +cd "$SRC_DIR" + +source ${HOME}/.bashrc +source activate /nfs/scratch/jonathan/micromamba/envs/bgpt + +# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_IP=localhost + +if [ "$1" = "--load-from-checkpoint" ]; then + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} --load-from-checkpoint +else + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} +fi + + diff --git a/bgpt/conditional_launch_nvidia_small.sh b/bgpt/conditional_launch_nvidia_small.sh new file mode 100644 index 0000000..457d21f --- /dev/null +++ b/bgpt/conditional_launch_nvidia_small.sh @@ 
-0,0 +1,25 @@ +#!/bin/bash +# Define the source and target directories +SRC_DIR="/home/jonathan/cerc/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_test.yaml" +# TODO: Pass this through from launch_global_jon.sh +NUM_NODES=1 +NUM_GPUS_PER_NODE=2 + +cd "$SRC_DIR" + +source ${HOME}/.bashrc +source activate /nfs/scratch/jonathan/micromamba/envs/bgpt + +# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_IP=localhost + +if [ "$1" = "--load-from-checkpoint" ]; then + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} --load-from-checkpoint +else + srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} +fi + + diff --git a/bgpt/config_og.py b/bgpt/config_og.py new file mode 100644 index 0000000..5bd5d47 --- /dev/null +++ b/bgpt/config_og.py @@ -0,0 +1,59 @@ +# Configuration for generative modelling and classification +TRAIN_FOLDERS = [ + # "wikipedia/train", + # "ag_news/train", + # "imagenet32/train", + # "cifar/train", + # "librispeech8K/train", + # "speech_commands8K/train", + #"irishman/train", + #"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy", + "/home/mila/m/mina.beiramy/scratch/bgpt/data/mix", + # "cpu_states/train", + ] # Folder containing training data +EVAL_FOLDERS = [ + # "wikipedia/test", + # "ag_news/test", + # "imagenet32/test", + # "cifar/test", + # "librispeech8K/test", + # "speech_commands8K/test", + #"irishman/test", + #"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy_val", + "/home/mila/m/mina.beiramy/scratch/bgpt/data/mix_val", + # "cpu_states/test", + ] # Folder containing evaluation data + +# Configuration for the paths +PRETRAINED_PATH = 
"weights-text.pth" # Path to pre-trained weights +WEIGHTS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/exp/chkp/abcmidi-btch16-ptch16-v1.pth" # Path to save weights +LOGS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/logs/logs-abcmidi-btch16-ptch16-v1.txt" # Path to save logs + +# Configuration for the model +PATCH_SIZE = 16 # Patch Size +PATCH_LENGTH = 512 # Patch Length +BYTE_NUM_LAYERS = 3 # Number of layers in the decoder +PATCH_NUM_LAYERS = 12 # Number of layers in the encoder +HIDDEN_SIZE = 768 # Hidden Size + +# Configuration for the training +NUM_EPOCHS = 10 # Number of epochs to train for (if early stopping doesn't intervene) +LEARNING_RATE = 1e-5 # Learning rate for the optimizer +BATCH_SIZE = 16 # Batch size for training +ACCUMULATION_STEPS = 1 # Accumulation steps to simulate large batch size +PATCH_SAMPLING_BATCH_SIZE = 0 # Batch size for patch during training, 0 for full conaudio +LOAD_FROM_CHECKPOINT = False # Whether to load weights from a checkpoint +LOAD_FROM_PRETRAINED = True # Whether to load pre-trained weights from a checkpoint +CONVERSION_MODE = 'abc&mid' # Mode of conversion (None for regular training, input->output for unidirectional conversion, input&output for bidirectional conversion) + +# Configuration for inference +INFERENCE_WEIGHTS_PATH = "weights-conversion.pth" # Path to weights for inference +INPUT_EXT = "abc" # Extension of input files, used for conversion +TARGET_EXT = "mid" # Extension of target files +INPUT_FOLDER = "input" # Folder containing input files +OUTPUT_FOLDER = "output" # Folder to save output files +MODE = "convert" # Mode of inference (convert or generate) +NUM_SAMPLES = 100 # Number of samples to generate (only for generate mode) +TOP_K = 0 # Top k for sampling +TOP_P = 1. 
# Top p for sampling +TEMPERATURE = 1 # Temperature for sampling \ No newline at end of file diff --git a/bgpt/configs/config_110M.yaml b/bgpt/configs/config_110M.yaml index d5924c2..e602679 100644 --- a/bgpt/configs/config_110M.yaml +++ b/bgpt/configs/config_110M.yaml @@ -1,7 +1,7 @@ train_folders: - - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample" + - "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3" eval_folders: [] -base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/bgpt_110m_ezmath_2digit" +base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia" weights_path: "weights.pth" pre_weights_path: null logs_path: "log.txt" @@ -17,10 +17,11 @@ learning_rate: 0.0001 batch_size: 16 accumulation_steps: 1 patch_sampling_batch_size: 0 -load_from_checkpoint: false +# load_from_checkpoint: false # Deprecated in favor of the --load-from-checkpoint command-line flag, which is off by default, so there is no need to maintain one config where this is false and a separate reload config where this is true.
load_from_pre_checkpoint: false checkpoint_frequency: 5 first_launch: true +verbose: true wandb: proj_name: byte_models entity: jonathanlimsc diff --git a/bgpt/configs/config_110M_jon.yaml b/bgpt/configs/config_110M_jon.yaml new file mode 100644 index 0000000..1cf2e36 --- /dev/null +++ b/bgpt/configs/config_110M_jon.yaml @@ -0,0 +1,34 @@ +train_folders: + - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample_10K" + # - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train" + # - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-easy" + # - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-medium" + # - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-hard" + # - "/lustre/orion/csc590/proj-shared/wikipedia/train_text3" + # - "/lustre/orion/csc590/proj-shared/imagenet-1k/train" +eval_folders: [] +base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/test_continual" +weights_path: "weights.pth" +pre_weights_path: null +logs_path: "log.txt" +checkpoint_path: "checkpoints" +dataloader_path: "dataloaders" +patch_size: 16 +patch_length: 512 +byte_num_layers: 3 +patch_num_layers: 12 +hidden_size: 768 +num_epochs: 32 +learning_rate: 0.0001 +batch_size: 16 +accumulation_steps: 1 +patch_sampling_batch_size: 0 +# load_from_checkpoint: false # Deprecated in favor of the --load-from-checkpoint command-line flag, which is off by default, so there is no need to maintain one config where this is false and a separate reload config where this is true.
+load_from_pre_checkpoint: false +checkpoint_frequency: 40 +first_launch: true +verbose: false +wandb: + proj_name: byte_models + entity: jonathanlimsc + mode: offline diff --git a/bgpt/configs/config_110M_math_nvidia.yaml b/bgpt/configs/config_110M_math_nvidia.yaml new file mode 100644 index 0000000..4522a82 --- /dev/null +++ b/bgpt/configs/config_110M_math_nvidia.yaml @@ -0,0 +1,31 @@ +train_folders: + - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/train" +eval_folders: + - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/validation" +base_dir: "/nfs/scratch/jonathan/byte_models/simple_math" +weights_path: "weights.pth" +pre_weights_path: null +logs_path: "log.txt" +checkpoint_path: "checkpoints" +dataloader_path: "dataloaders" +patch_size: 16 +patch_length: 512 +byte_num_layers: 3 +patch_num_layers: 12 +hidden_size: 768 +num_epochs: 10 +learning_rate: 0.0001 +batch_size: 8 +accumulation_steps: 1 +patch_sampling_batch_size: 0 +# load_from_checkpoint: false # Deprecated in favor of the --load-from-checkpoint command-line flag, which is off by default, so there is no need to maintain one config where this is false and a separate reload config where this is true.
+load_from_pre_checkpoint: false +checkpoint_frequency: 10000 +logging_frequency: 100 +first_launch: true +verbose: false +wandb: + proj_name: byte_models + entity: jonathanlimsc + mode: online # set to 'offline' if you don't want runs uploaded to the W&B server + name: simple_math diff --git a/bgpt/configs/config_110M_test.yaml b/bgpt/configs/config_110M_test.yaml new file mode 100644 index 0000000..a6f4269 --- /dev/null +++ b/bgpt/configs/config_110M_test.yaml @@ -0,0 +1,31 @@ +train_folders: + - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/sample_10K" +eval_folders: + - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/val_1K" +base_dir: "/nfs/scratch/jonathan/byte_models/test_eval" +weights_path: "weights.pth" +pre_weights_path: null +logs_path: "log.txt" +checkpoint_path: "checkpoints" +dataloader_path: "dataloaders" +patch_size: 16 +patch_length: 512 +byte_num_layers: 3 +patch_num_layers: 12 +hidden_size: 768 +num_epochs: 10 +learning_rate: 0.0001 +batch_size: 8 +accumulation_steps: 1 +patch_sampling_batch_size: 0 +# load_from_checkpoint: false # Deprecated in favor of the --load-from-checkpoint command-line flag, which is off by default, so there is no need to maintain one config where this is false and a separate reload config where this is true.
+load_from_pre_checkpoint: false +checkpoint_frequency: 1000 +logging_frequency: 100 +first_launch: true +verbose: false +wandb: + proj_name: byte_models + entity: jonathanlimsc + mode: online # set to 'offline' if you don't want runs uploaded to the W&B server + name: test_eval diff --git a/bgpt/configs/config_1B_jon.yaml b/bgpt/configs/config_1B_jon.yaml new file mode 100644 index 0000000..d846606 --- /dev/null +++ b/bgpt/configs/config_1B_jon.yaml @@ -0,0 +1,28 @@ +train_folders: + - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample" +eval_folders: [] +base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun" +weights_path: "weights.pth" +pre_weights_path: null +logs_path: "log.txt" +checkpoint_path: "checkpoints" +dataloader_path: "dataloaders" +patch_size: 16 +patch_length: 512 +byte_num_layers: 3 +patch_num_layers: 16 +hidden_size: 2048 +num_epochs: 5 +learning_rate: 0.0001 +batch_size: 4 +accumulation_steps: 1 +patch_sampling_batch_size: 0 +load_from_checkpoint: false +load_from_pre_checkpoint: false +checkpoint_frequency: 5 +first_launch: true +verbose: false +wandb: + proj_name: byte_models + entity: jonathanlimsc + mode: offline diff --git a/bgpt/configs/config_400M_jon.yaml b/bgpt/configs/config_400M_jon.yaml new file mode 100644 index 0000000..b36ae20 --- /dev/null +++ b/bgpt/configs/config_400M_jon.yaml @@ -0,0 +1,28 @@ +train_folders: + - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train" +eval_folders: [] +base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun" +weights_path: "weights.pth" +pre_weights_path: null +logs_path: "log.txt" +checkpoint_path: "checkpoints" +dataloader_path: "dataloaders" +patch_size: 16 +patch_length: 512 +byte_num_layers: 3 +patch_num_layers: 24 +hidden_size: 1024 +num_epochs: 5 +learning_rate: 0.0001 +batch_size: 8 +accumulation_steps: 1 +patch_sampling_batch_size: 0 +load_from_checkpoint: false +load_from_pre_checkpoint: false +checkpoint_frequency: 5 +first_launch: true
+verbose: false +wandb: + proj_name: byte_models + entity: jonathanlimsc + mode: offline diff --git a/bgpt/configs/reload_config_110M.yaml b/bgpt/configs/reload_config_110M.yaml new file mode 100644 index 0000000..077d149 --- /dev/null +++ b/bgpt/configs/reload_config_110M.yaml @@ -0,0 +1,28 @@ +train_folders: + - "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3" +eval_folders: [] +base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia" +weights_path: "weights.pth" +pre_weights_path: null +logs_path: "log.txt" +checkpoint_path: "checkpoints" +dataloader_path: "dataloaders" +patch_size: 16 +patch_length: 1024 +byte_num_layers: 3 +patch_num_layers: 12 +hidden_size: 768 +num_epochs: 32 +learning_rate: 0.0001 +batch_size: 16 +accumulation_steps: 1 +patch_sampling_batch_size: 0 +load_from_checkpoint: true +load_from_pre_checkpoint: false +checkpoint_frequency: 5 +first_launch: true +verbose: true +wandb: + proj_name: byte_models + entity: jonathanlimsc + mode: offline diff --git a/bgpt/launch-interactive.sh b/bgpt/launch-interactive.sh new file mode 100755 index 0000000..155b15a --- /dev/null +++ b/bgpt/launch-interactive.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +USERNAME=jonathanlimsc +# Define the source and target directories +SRC_DIR=/lustre/orion/csc590/scratch/${USERNAME}/bgpt + +# Change to config YAML path +CONFIG_PATH=$(pwd)/configs/config_100M.yaml + +cd "$SRC_DIR" + +source /ccs/home/jonathanlimsc/miniconda3/etc/profile.d/conda.sh # conda.sh defines the conda shell function that "conda activate" below relies on +conda activate /ccs/home/jonathanlimsc/.conda/envs/bgpt/ + +module load rocm/5.2 + +# export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1` + +# mkdir "$SLURM_JOB_NAME" +# mkdir "$SLURM_JOB_NAME"/checkpoints +# mkdir "$SLURM_JOB_NAME"/dataloaders + +python train-gen.py --train-config-path ${CONFIG_PATH} diff --git a/bgpt/launch.sh b/bgpt/launch.sh index 47d948a..28d5536 100644 --- a/bgpt/launch.sh +++ b/bgpt/launch.sh @@ -7,15 +7,14 @@ #SBATCH -N 32
# Define the source and target directories -SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt" +SRC_DIR="/lustre/orion/csc590/scratch/george-adams/byte_models/bgpt" # Change to config YAML path -CONFIG_PATH="bgpt/configs/config_100M.yaml" +CONFIG_PATH="/lustre/orion/csc590/scratch/george-adams/byte_models/bgpt/configs/config_110M.yaml" cd "$SRC_DIR" -source /lustre/orion/csc590/scratch/$(whoami)/miniconda3/etc/profile.d/conda.sh -conda activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt +source activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt module load rocm/5.2 @@ -25,4 +24,4 @@ export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\ # mkdir "$SLURM_JOB_NAME"/checkpoints # mkdir "$SLURM_JOB_NAME"/dataloaders -srun torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} +srun torchrun --nnodes=32 --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} diff --git a/bgpt/launch_batch_jon.sh b/bgpt/launch_batch_jon.sh new file mode 100755 index 0000000..a7abe69 --- /dev/null +++ b/bgpt/launch_batch_jon.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH -A CSC590 +#SBATCH -J test-interactive +#SBATCH -o /lustre/orion/csc590/scratch/jonathanlimsc/bgpt/logs/%x-%j.out +#SBATCH -t 00:10:00 +#SBATCH -p batch +#SBATCH -q debug +#SBATCH -N 2 + +# Define the source and target directories +SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_1B_jon.yaml" + +cd "$SRC_DIR" + +source activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2 + +module load rocm/5.2 + +export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1` + +# mkdir "$SLURM_JOB_NAME" +# mkdir
"$SLURM_JOB_NAME"/checkpoints +# mkdir "$SLURM_JOB_NAME"/dataloaders + +srun torchrun --nnodes=${SLURM_JOB_NUM_NODES:-1} --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} # --nnodes follows the Slurm allocation (#SBATCH -N) so every node srun starts joins one rendezvous diff --git a/bgpt/launch_batch_jon_nvidia.sh b/bgpt/launch_batch_jon_nvidia.sh new file mode 100755 index 0000000..6418c22 --- /dev/null +++ b/bgpt/launch_batch_jon_nvidia.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH -J test-batch +#SBATCH -o /nfs/scratch/jonathan/byte_models/logs/%x-%j.out +#SBATCH --error=/nfs/scratch/jonathan/byte_models/logs/%x-%j.err +#SBATCH -N 1 +#SBATCH --cpus-per-task 6 +#SBATCH --gres=gpu:A100:2 +#SBATCH --mem=32G +#SBATCH -t 00:5:00 + +# Define the source and target directories +SRC_DIR="/home/jonathan/cerc/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_jon_nvidia.yaml" + +NUM_NODES=1 +NUM_GPUS_PER_NODE=2 + +cd "$SRC_DIR" + +source ${HOME}/.bashrc +source activate /nfs/scratch/jonathan/micromamba/envs/bgpt + +# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_IP=localhost +# export MASTER_PORT=$((((RANDOM<<15)|RANDOM)%63001+2001)) + +srun torchrun --nnodes=${NUM_NODES} --nproc_per_node=${NUM_GPUS_PER_NODE} --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} --load-from-checkpoint diff --git a/bgpt/launch_global.sh b/bgpt/launch_global.sh new file mode 100755 index 0000000..c423168 --- /dev/null +++ b/bgpt/launch_global.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt" + +# Number of jobs to submit +NUM_JOBS=2 +NUM_NODES=32 +NUM_GPUS_PER_NODE=8 +TIME_LIMIT_PER_JOB="2:00:00" + +# Job dependency (set to "none" for the first job) +DEPENDENCY="none" + +for (( i=1; i<=NUM_JOBS; i++ )) +do + # Job name + JOB_NAME="110m-wikipedia-reloaded-$i" + + # Submit job and
capture the job ID + if [ "$DEPENDENCY" == "none" ]; then + JOB_SUBMIT_OUTPUT=$(sbatch --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --error=${SRC_DIR}/logs/%x-%j.err --time=${TIME_LIMIT_PER_JOB} --nodes=${NUM_NODES} --partition=batch conditional_launch.sh) + else + JOB_SUBMIT_OUTPUT=$(sbatch --dependency=afterany:$DEPENDENCY --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --error=${SRC_DIR}/logs/%x-%j.err --time=${TIME_LIMIT_PER_JOB} --nodes=${NUM_NODES} --partition=batch conditional_launch.sh --load-from-checkpoint) + fi + + # Extract job ID from the submission output + JOB_ID=$(echo $JOB_SUBMIT_OUTPUT | grep -oP '\d+') + echo "Submitted job $JOB_ID" + + # Set this job ID as the dependency for the next job + DEPENDENCY=$JOB_ID +done diff --git a/bgpt/launch_global_jon.sh b/bgpt/launch_global_jon.sh new file mode 100755 index 0000000..3184637 --- /dev/null +++ b/bgpt/launch_global_jon.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +SRC_DIR="/lustre/orion/csc590/scratch/jonathanlimsc/bgpt" + +# Number of jobs to submit +NUM_JOBS=2 +NUM_NODES=2 + +# Job dependency (set to "none" for the first job) +DEPENDENCY="none" + +for (( i=1; i<=NUM_JOBS; i++ )) +do + # Job name + JOB_NAME="test-continual-$i" + + # Submit job and capture the job ID + if [ "$DEPENDENCY" == "none" ]; then + JOB_SUBMIT_OUTPUT=$(sbatch -A csc590 --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --time=00:05:00 --partition=batch --nodes=${NUM_NODES} conditional_launch_jon.sh) + else + JOB_SUBMIT_OUTPUT=$(sbatch -A csc590 --dependency=afterany:$DEPENDENCY --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --time=00:05:00 --partition=batch --nodes=${NUM_NODES} conditional_launch_jon.sh --load-from-checkpoint) + fi + + # Extract job ID from the submission output + JOB_ID=$(echo $JOB_SUBMIT_OUTPUT | grep -oP '\d+') + echo "Submitted job $JOB_ID" + + # Set this job ID as the dependency for the next job + DEPENDENCY=$JOB_ID +done diff --git a/bgpt/launch_global_nvidia.sh
b/bgpt/launch_global_nvidia.sh new file mode 100755 index 0000000..8045a7a --- /dev/null +++ b/bgpt/launch_global_nvidia.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +SRC_DIR="/nfs/scratch/jonathan/byte_models" + +# Number of jobs to submit +NUM_JOBS=3 +NUM_NODES=1 +NUM_GPUS_PER_NODE=4 +CPUS_PER_TASK=6 +MEM="32G" +TIME_LIMIT_PER_JOB="24:00:00" +# Job dependency (set to "none" for the first job) +DEPENDENCY="none" + +for (( i=1; i<=NUM_JOBS; i++ )) +do + # Job name + JOB_NAME="continual-job-$i" + + # Submit job and capture the job ID + if [ "$DEPENDENCY" == "none" ]; then + JOB_SUBMIT_OUTPUT=$(sbatch --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --error=${SRC_DIR}/logs/%x-%j.err --time=${TIME_LIMIT_PER_JOB} --nodes=${NUM_NODES} --gres=gpu:A100:${NUM_GPUS_PER_NODE} --cpus-per-task=${CPUS_PER_TASK} --mem=${MEM} conditional_launch_nvidia.sh) + else + JOB_SUBMIT_OUTPUT=$(sbatch --dependency=afterany:$DEPENDENCY --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --error=${SRC_DIR}/logs/%x-%j.err --time=${TIME_LIMIT_PER_JOB} --nodes=${NUM_NODES} --gres=gpu:A100:${NUM_GPUS_PER_NODE} --cpus-per-task=${CPUS_PER_TASK} --mem=${MEM} conditional_launch_nvidia.sh --load-from-checkpoint) + fi + + # Extract job ID from the submission output + JOB_ID=$(echo $JOB_SUBMIT_OUTPUT | grep -oP '\d+') + echo "Submitted job $JOB_ID" + + # Set this job ID as the dependency for the next job + DEPENDENCY=$JOB_ID +done diff --git a/bgpt/launch_global_nvidia_small.sh b/bgpt/launch_global_nvidia_small.sh new file mode 100755 index 0000000..ca3052c --- /dev/null +++ b/bgpt/launch_global_nvidia_small.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +SRC_DIR="/nfs/scratch/jonathan/byte_models" + +# Number of jobs to submit +NUM_JOBS=1 +NUM_NODES=1 +NUM_GPUS_PER_NODE=2 +CPUS_PER_TASK=6 +MEM="32G" +TIME_LIMIT_PER_JOB="24:00:00" +# Job dependency (set to "none" for the first job) +DEPENDENCY="none" + +for (( i=1; i<=NUM_JOBS; i++ )) +do + # Job name + JOB_NAME="test-eval-$i" + + # Submit job and capture the job 
ID + if [ "$DEPENDENCY" == "none" ]; then + JOB_SUBMIT_OUTPUT=$(sbatch --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --error=${SRC_DIR}/logs/%x-%j.err --time=${TIME_LIMIT_PER_JOB} --nodes=${NUM_NODES} --gres=gpu:A100:${NUM_GPUS_PER_NODE} --cpus-per-task=${CPUS_PER_TASK} --mem=${MEM} conditional_launch_nvidia_small.sh) + else + JOB_SUBMIT_OUTPUT=$(sbatch --dependency=afterany:$DEPENDENCY --job-name=$JOB_NAME --output=${SRC_DIR}/logs/%x-%j.out --error=${SRC_DIR}/logs/%x-%j.err --time=${TIME_LIMIT_PER_JOB} --nodes=${NUM_NODES} --gres=gpu:A100:${NUM_GPUS_PER_NODE} --cpus-per-task=${CPUS_PER_TASK} --mem=${MEM} conditional_launch_nvidia_small.sh --load-from-checkpoint) + fi + + # Extract job ID from the submission output + JOB_ID=$(echo $JOB_SUBMIT_OUTPUT | grep -oP '\d+') + echo "Submitted job $JOB_ID" + + # Set this job ID as the dependency for the next job + DEPENDENCY=$JOB_ID +done diff --git a/bgpt/launch_interactive_jon.sh b/bgpt/launch_interactive_jon.sh new file mode 100755 index 0000000..24f7a02 --- /dev/null +++ b/bgpt/launch_interactive_jon.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Define the source and target directories +SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_110M_jon.yaml" + +cd "$SRC_DIR" + +source /ccs/home/jonathanlimsc/miniconda3/etc/profile.d/conda.sh +conda activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2 + +# module load rocm/5.2 + +export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1` + +# mkdir "$SLURM_JOB_NAME" +# mkdir "$SLURM_JOB_NAME"/checkpoints +# mkdir "$SLURM_JOB_NAME"/dataloaders + +srun torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} diff --git a/bgpt/launch_interactive_jon_nvidia.sh b/bgpt/launch_interactive_jon_nvidia.sh new file mode 
100755 index 0000000..aaccebd --- /dev/null +++ b/bgpt/launch_interactive_jon_nvidia.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Define the source and target directories +SRC_DIR="/home/jonathan/cerc/byte_models/bgpt" + +# Change to config YAML path +CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_jon_nvidia.yaml" + +cd "$SRC_DIR" + +source ${HOME}/.bashrc +source activate /nfs/scratch/jonathan/micromamba/envs/bgpt + +# module load rocm/5.2 + +export MASTER_IP=localhost +# export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1` +# export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) + +# torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} +torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id=1337 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH} diff --git a/bgpt/reloading_sampler.py b/bgpt/reloading_sampler.py new file mode 100644 index 0000000..9e416b7 --- /dev/null +++ b/bgpt/reloading_sampler.py @@ -0,0 +1,37 @@ +import math +from typing import Optional, Iterator + +import torch +from torch.utils.data import Dataset +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler + + +class CustomDistributedSampler(DistributedSampler): + def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None, + rank: Optional[int] = None, shuffle: bool = True, + seed: int = 0, drop_last: bool = False, start_index: int = 0) -> None: + ''' + start_index: The index in dataset to start sampling indices onwards from. Default is 0. 
+ ''' + # Initialize the parent DistributedSampler class with the provided arguments + super(CustomDistributedSampler, self).__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle, + seed=seed, drop_last=drop_last) + + # Additional attribute for custom functionality + self.start_index = start_index + + def __iter__(self) -> Iterator[int]: + # Generate a list of indices as the DistributedSampler would + indices = super(CustomDistributedSampler, self).__iter__() + + # Convert iterator to list to manipulate the starting index + indices = list(indices) + + indices = indices[self.start_index:] + + return iter(indices) + + def set_index(self, index: int): + # Set the starting index for sampling + self.start_index = index diff --git a/bgpt/requirements_nvidia.txt b/bgpt/requirements_nvidia.txt new file mode 100644 index 0000000..495a37e Binary files /dev/null and b/bgpt/requirements_nvidia.txt differ diff --git a/bgpt/train-gen-og.py b/bgpt/train-gen-og.py new file mode 100644 index 0000000..70801a5 --- /dev/null +++ b/bgpt/train-gen-og.py @@ -0,0 +1,421 @@ +import os +import time +import torch +import random +import wandb +import numpy as np +from utils import * +from config_og import * +from tqdm import tqdm +from copy import deepcopy +from torch.cuda.amp import autocast, GradScaler +from torch.utils.data import Dataset, DataLoader +from transformers import GPT2Config, get_scheduler +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler +from datetime import datetime + + + +# Set up distributed training +world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 +global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0 +local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else 0 + +if world_size > 1: + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + 
dist.init_process_group(backend='nccl') if world_size > 1 else None +else: + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + +# Set random seed +seed = 0 + global_rank +random.seed(seed) +np.random.seed(seed) +torch.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +batch_size = BATCH_SIZE + +patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS, + max_length=PATCH_LENGTH, + max_position_embeddings=PATCH_LENGTH, + hidden_size=HIDDEN_SIZE, + n_head=HIDDEN_SIZE//64, + vocab_size=1) +byte_config = GPT2Config(num_hidden_layers=BYTE_NUM_LAYERS, + max_length=PATCH_SIZE+1, + max_position_embeddings=PATCH_SIZE+1, + hidden_size=HIDDEN_SIZE, + n_head=HIDDEN_SIZE//64, + vocab_size=256+1) +model = bGPTLMHeadModel(patch_config, byte_config) +model = model.to(device) + +# print parameter number +print("Parameter Number: "+str(sum(p.numel() for p in model.parameters() if p.requires_grad))) + +if world_size > 1: + model = DDP(model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) + +scaler = GradScaler() +is_autocast = True +optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) + +def collate_batch(input_batches): + + input_patches, input_masks = zip(*input_batches) + input_patches = torch.nn.utils.rnn.pad_sequence(input_patches, batch_first=True, padding_value=256) + input_masks = torch.nn.utils.rnn.pad_sequence(input_masks, batch_first=True, padding_value=0) + + return input_patches.to(device), input_masks.to(device) + +def split_into_minibatches(input_patches, input_masks, minibatch_size): + minibatches = [] + for start_idx in range(0, len(input_patches), minibatch_size): + end_idx = start_idx + minibatch_size + minibatch_patches = input_patches[start_idx:end_idx] + minibatch_masks = input_masks[start_idx:end_idx] + minibatches.append((minibatch_patches, minibatch_masks)) + return minibatches + +def 
list_files_in_directory(directories): + file_list = [] + + for directory in directories: + for root, dirs, files in os.walk(directory): + for file in files: + file_path = os.path.join(root, file) + file_list.append(file_path) + return file_list + +def read_bytes(filename): + + ext = filename.split('.')[-1] + ext = bytearray(ext, 'utf-8') + ext = [byte for byte in ext][:PATCH_SIZE] + + with open(filename, 'rb') as f: + file_bytes = f.read() + + bytes = [] + for byte in file_bytes: + bytes.append(byte) + + if len(bytes)%PATCH_SIZE!=0: + bytes = bytes + [256] * (PATCH_SIZE - len(bytes) % PATCH_SIZE) + + bos_patch = ext + [256] * (PATCH_SIZE - len(ext)) + bytes = bos_patch + bytes + [256] * PATCH_SIZE + + if len(bytes) > PATCH_LENGTH*PATCH_SIZE: + print(f"Warning: {filename} is too long, truncating to {PATCH_LENGTH*PATCH_SIZE} bytes.") + bytes = bytes[:PATCH_LENGTH*PATCH_SIZE] + + masks = [1] * (len(bytes)//PATCH_SIZE) + + return bytes, masks + +class ByteDataset(Dataset): + def __init__(self, filenames): + if CONVERSION_MODE == None: + print(f"Regular Training Mode: {CONVERSION_MODE}, loading {len(filenames)} files") + self.filenames = filenames + elif "->" in CONVERSION_MODE: + print(f"Unidirectional Conversion Mode: {CONVERSION_MODE}, loading {len(filenames)} files") + input_ext = CONVERSION_MODE.split("->")[0] + target_ext = CONVERSION_MODE.split("->")[1] + + self.filenames = [] + for filename in filenames: + if filename.split('.')[-1]==input_ext: + target_filename = filename[:-(len(input_ext))] + target_ext + if os.path.exists(target_filename): + self.filenames.append((filename, target_filename)) + elif "&" in CONVERSION_MODE: + print(f"Bidirectional Conversion Mode: {CONVERSION_MODE}, loading {len(filenames)} files") + input_ext = CONVERSION_MODE.split("&")[0] + target_ext = CONVERSION_MODE.split("&")[1] + + self.filenames = [] + for filename in filenames: + if filename.split('.')[-1]==input_ext: + target_filename = filename[:-(len(input_ext))] + target_ext + if 
os.path.exists(target_filename): + self.filenames.append((filename, target_filename)) + elif filename.split('.')[-1]==target_ext: + input_filename = filename[:-(len(target_ext))] + input_ext + if os.path.exists(input_filename): + self.filenames.append((input_filename, filename)) + else: + raise ValueError("Invalid Conversion Mode, please check the config.py file") + + def __len__(self): + return len(self.filenames) + + def __getitem__(self, idx): + + if CONVERSION_MODE == None: + filename = self.filenames[idx] + file_bytes, file_masks = read_bytes(filename) + else: + input_filename, target_filename = self.filenames[idx] + input_bytes, input_masks = read_bytes(input_filename) + target_bytes, target_masks = read_bytes(target_filename) + + file_bytes = input_bytes[:-PATCH_SIZE] + target_bytes + file_masks = input_masks[:-1] + target_masks + + if len(file_bytes) > PATCH_LENGTH*PATCH_SIZE: + print(f"Warning: {input_filename} and {target_filename} are too long after concatenation, truncating to {PATCH_LENGTH*PATCH_SIZE} bytes.") + file_bytes = file_bytes[:PATCH_LENGTH*PATCH_SIZE] + file_masks = file_masks[:PATCH_LENGTH] + + file_bytes = torch.tensor(file_bytes, dtype=torch.long) + file_masks = torch.tensor(file_masks, dtype=torch.long) + + return file_bytes, file_masks + +# call model with a batch of input +def process_one_batch(batch): + input_patches, input_masks = batch + loss = model(input_patches, input_masks).loss + + # Reduce the loss on GPU 0 + if world_size > 1: + loss = loss.unsqueeze(0) + dist.reduce(loss, dst=0) + loss = loss / world_size + dist.broadcast(loss, src=0) + + return loss + +# do one epoch for training +# update total iters value from checkpoint in main training loop, else 1 +def train_epoch(total_train_iters = 1, logging_frequency = 100): + tqdm_train_set = tqdm(train_set) + # epoch loss + total_train_loss = 0 + iter_idx = 1 + model.train() + + for batch in tqdm_train_set: + minibatches = split_into_minibatches(batch[0], batch[1], 
BATCH_SIZE//ACCUMULATION_STEPS) + # minibatch loss + minibatch_loss = 0 + for minibatch in minibatches: + with autocast(): + loss = process_one_batch(minibatch) / ACCUMULATION_STEPS + scaler.scale(loss).backward() + loss_train = loss.item() + minibatch_loss += loss_train + total_train_loss += loss_train + + if global_rank == 0: + if iter_idx % logging_frequency == 0: + wandb.log({ + "train_loss": minibatch_loss, + "total_train_iters": total_train_iters, + }, step=total_train_iters) + scaler.step(optimizer) + scaler.update() + + lr_scheduler.step() + model.zero_grad(set_to_none=True) + tqdm_train_set.set_postfix({str(global_rank)+'_train_loss': total_train_loss / iter_idx}) + + iter_idx += 1 + total_train_iters +=1 + return total_train_loss / (iter_idx-1), total_train_iters + +# do one epoch for eval +def eval_epoch(total_val_iters = 1, logging_frequency = 100): + tqdm_eval_set = tqdm(eval_set) + total_eval_loss = 0 + iter_idx = 1 + model.eval() + + # Evaluate data for one epoch + for batch in tqdm_eval_set: + minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS) + minibatch_loss = 0 + for minibatch in minibatches: + with torch.no_grad(): + loss = process_one_batch(minibatch) / ACCUMULATION_STEPS + loss_eval = loss.item() + total_eval_loss += loss_eval + minibatch_loss += loss_eval + if global_rank == 0: + if iter_idx % logging_frequency == 0: + wandb.log({ + "eval_loss": minibatch_loss, + "total_val_iters": total_val_iters, + }, step=total_val_iters) + # print(f"current iter is : {total_val_iters}") + tqdm_eval_set.set_postfix({str(global_rank)+'_eval_loss': total_eval_loss / iter_idx}) + iter_idx += 1 + total_val_iters += 1 + # if iter_idx == 0: + # return 0 # Safeguard: Return 0 or another appropriate value if no iterations were done + # else: + # return total_eval_loss / iter_idx + return total_eval_loss / (iter_idx-1), total_val_iters + +# train and eval +if __name__ == "__main__": + current_time = datetime.now() + + # Format 
as a string + run_name = current_time.strftime("%Y-%m-%d_%H-%M-%S") + if global_rank == 0: + wandb.init(project="null", entity="mina-beiramy", name=f"bgpt_abc_midi_{run_name}") + + wandb.config.update({ + "TRAIN_FOLDERS": TRAIN_FOLDERS, + "EVAL_FOLDERS": EVAL_FOLDERS, + # "PRE_WEIGHTS_PATH": PRE_WEIGHTS_PATH, + "WEIGHTS_PATH": WEIGHTS_PATH, + "LOGS_PATH": LOGS_PATH, + "PATCH_SIZE": PATCH_SIZE, + "PATCH_LENGTH": PATCH_LENGTH, + "BYTE_NUM_LAYERS": BYTE_NUM_LAYERS, + "PATCH_NUM_LAYERS": PATCH_NUM_LAYERS, + "HIDDEN_SIZE": HIDDEN_SIZE, + "NUM_EPOCHS": NUM_EPOCHS, + "LEARNING_RATE": LEARNING_RATE, + "BATCH_SIZE": BATCH_SIZE, + "ACCUMULATION_STEPS": ACCUMULATION_STEPS, + "LOAD_FROM_CHECKPOINT": LOAD_FROM_CHECKPOINT, + # "LOAD_FROM_PRE_CHECKPOINT": LOAD_FROM_PRE_CHECKPOINT + # Add any other configurations you'd like to track + }) + + # load filenames under train and eval folder + train_files = list_files_in_directory(TRAIN_FOLDERS) + eval_files = list_files_in_directory(EVAL_FOLDERS) + + train_batch_nums = int(len(train_files) / batch_size) + eval_batch_nums = int(len(eval_files) / batch_size) + + random.shuffle(train_files) + random.shuffle(eval_files) + + train_files = train_files[:train_batch_nums*batch_size] + eval_files = eval_files[:eval_batch_nums*batch_size] + + train_set = ByteDataset(train_files) + eval_set = ByteDataset(eval_files) + + train_sampler = DistributedSampler(train_set, num_replicas=world_size, rank=local_rank) + eval_sampler = DistributedSampler(eval_set, num_replicas=world_size, rank=local_rank) + + train_set = DataLoader(train_set, batch_size=batch_size, collate_fn=collate_batch, sampler=train_sampler, shuffle = (train_sampler is None)) + eval_set = DataLoader(eval_set, batch_size=batch_size, collate_fn=collate_batch, sampler=eval_sampler, shuffle = (train_sampler is None)) + + lr_scheduler = get_scheduler( + name="cosine", + optimizer=optimizer, + num_warmup_steps=NUM_EPOCHS * len(train_set) // 10, + num_training_steps=NUM_EPOCHS * len(train_set), 
+ ) + model = model.to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) + + if LOAD_FROM_PRETRAINED and os.path.exists(PRETRAINED_PATH): + # Load checkpoint to CPU + checkpoint = torch.load(PRETRAINED_PATH, map_location='cpu') + + # Here, model is assumed to be on GPU + # Load state dict to CPU model first, then move the model to GPU + if torch.cuda.device_count() > 1: + # If you have a DataParallel model, you need to load to model.module instead + cpu_model = deepcopy(model.module) + cpu_model.load_state_dict(checkpoint['model']) + model.module.load_state_dict(cpu_model.state_dict()) + else: + # Load to a CPU clone of the model, then load back + cpu_model = deepcopy(model) + cpu_model.load_state_dict(checkpoint['model']) + model.load_state_dict(cpu_model.state_dict()) + + print(f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}") + + else: + pre_epoch = 0 + best_epoch = 0 + min_eval_loss = float('inf') + + if LOAD_FROM_CHECKPOINT and os.path.exists(WEIGHTS_PATH): + # Load checkpoint to CPU + checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu') + + # Here, model is assumed to be on GPU + # Load state dict to CPU model first, then move the model to GPU + if torch.cuda.device_count() > 1: + # If you have a DataParallel model, you need to load to model.module instead + cpu_model = deepcopy(model.module) + cpu_model.load_state_dict(checkpoint['model']) + model.module.load_state_dict(cpu_model.state_dict()) + else: + # Load to a CPU clone of the model, then load back + cpu_model = deepcopy(model) + cpu_model.load_state_dict(checkpoint['model']) + model.load_state_dict(cpu_model.state_dict()) + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_sched']) + pre_epoch = checkpoint['epoch'] + best_epoch = checkpoint['best_epoch'] + min_eval_loss = checkpoint['min_eval_loss'] + print("Successfully Loaded Checkpoint from Epoch %d" % 
pre_epoch) + checkpoint = None + + else: + pre_epoch = 0 + best_epoch = 0 + min_eval_loss = float('inf') + + total_train_iters = 1 + total_val_iters = 1 + + for epoch in range(1+pre_epoch, NUM_EPOCHS+1): + train_sampler.set_epoch(epoch) + eval_sampler.set_epoch(epoch) + print('-' * 21 + "Epoch " + str(epoch) + '-' * 21) + avg_train_loss, total_train_iters = train_epoch(total_train_iters, 1) + eval_loss, total_val_iters = eval_epoch(total_val_iters, 1) + print(total_val_iters) + + with open(LOGS_PATH,'a') as f: + f.write("Epoch " + str(epoch) + "\ntrain_loss: " + str(avg_train_loss) + "\neval_loss: " +str(eval_loss) + "\ntime: " + time.asctime(time.localtime(time.time())) + "\n\n") + # print(f"current iter is : {total_train_iters}") + if global_rank == 0: + wandb.log({ + "epoch_avg_train_loss": avg_train_loss, + "avg_eval_loss": eval_loss, + "epoch": epoch, + "total_iters": total_train_iters, + }, step=total_train_iters) + + if eval_loss < min_eval_loss: + best_epoch = epoch + min_eval_loss = eval_loss + checkpoint = { + 'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_sched': lr_scheduler.state_dict(), + 'epoch': epoch, + 'best_epoch': best_epoch, + 'min_eval_loss': min_eval_loss + } + torch.save(checkpoint, WEIGHTS_PATH) + + if world_size > 1: + dist.barrier() + + if global_rank==0: + print("Best Eval Epoch : "+str(best_epoch)) + print("Min Eval Loss : "+str(min_eval_loss)) diff --git a/bgpt/train-gen.py b/bgpt/train-gen.py index 38e995c..79603f2 100644 --- a/bgpt/train-gen.py +++ b/bgpt/train-gen.py @@ -5,11 +5,13 @@ import random import numpy as np import yaml +import json import argparse from utils import * # from config import * from pathlib import Path +from datetime import datetime from tqdm import tqdm from copy import deepcopy from torch.cuda.amp import autocast, GradScaler @@ -17,6 +19,7 @@ import torch.distributed as dist from torch.nn.parallel import 
DistributedDataParallel as DDP from torch.utils.data.distributed import DistributedSampler +from reloading_sampler import CustomDistributedSampler import wandb @@ -39,7 +42,7 @@ # Set random seed -seed = 0 + global_rank +seed = 0 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -47,28 +50,38 @@ torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + def find_most_recent_file(directory, pattern="*.pth"): file_paths = glob.glob(os.path.join(directory, pattern)) if file_paths: return max(file_paths, key=os.path.getmtime) return None +def find_most_recent_file_from_checkpoint_info(checkpoint_info_filepath): + checkpoint_filepath = None + with open(checkpoint_info_filepath, "r") as f: + data = json.load(f) + checkpoint_filepath = data.get("latest_checkpoint_path", None) + + return checkpoint_filepath def collate_batch(input_batches): - input_patches, input_masks = zip(*input_batches) + input_patches, input_masks, input_file_indices = zip(*input_batches) input_patches = torch.nn.utils.rnn.pad_sequence(input_patches, batch_first=True, padding_value=256) input_masks = torch.nn.utils.rnn.pad_sequence(input_masks, batch_first=True, padding_value=0) + input_file_indices = torch.nn.utils.rnn.pad_sequence(input_file_indices, batch_first=True, padding_value=-1) - return input_patches.to(device), input_masks.to(device) + return input_patches.to(device), input_masks.to(device), input_file_indices.to(device) -def split_into_minibatches(input_patches, input_masks, minibatch_size): +def split_into_minibatches(input_patches, input_masks, input_file_indices, minibatch_size): minibatches = [] for start_idx in range(0, len(input_patches), minibatch_size): end_idx = start_idx + minibatch_size minibatch_patches = input_patches[start_idx:end_idx] minibatch_masks = input_masks[start_idx:end_idx] - minibatches.append((minibatch_patches, minibatch_masks)) + minibatch_file_indices = input_file_indices[start_idx:end_idx] + 
minibatches.append((minibatch_patches, minibatch_masks, minibatch_file_indices)) return minibatches @@ -83,8 +96,8 @@ def list_files_in_directory(directories): return file_list -def read_bytes(filename, - patch_size, +def read_bytes(filename, + patch_size, patch_length): ext = filename.split('.')[-1] ext = bytearray(ext, 'utf-8') @@ -123,16 +136,20 @@ def __getitem__(self, idx): file_bytes = torch.tensor(file_bytes, dtype=torch.long) file_masks = torch.tensor(file_masks, dtype=torch.long) + file_idx = torch.tensor([idx], dtype=torch.long) - return file_bytes, file_masks + return file_bytes, file_masks, file_idx # call model with a batch of input -def process_one_batch(batch, - model, - world_size +def process_one_batch(batch, + model, + verbose=False ): - input_patches, input_masks = batch + input_patches, input_masks, input_file_indices = batch + if verbose: + print(f"Global Rank {global_rank}/{world_size} - File indices in batch: {input_file_indices}") + loss = model(input_patches, input_masks).loss # Reduce the loss on GPU 0 @@ -147,32 +164,43 @@ def process_one_batch(batch, # do one epoch for training def train_epoch(model, - train_set, - lr_scheduler, + train_set, + eval_set, + lr_scheduler, scaler, optimizer, - epoch, - best_epoch, - min_eval_loss, - batch_size, - accumulation_steps, - checkpoint_frequency, - checkpoint_path): - + epoch, + best_epoch, + min_eval_loss, + batch_size, + accumulation_steps, + checkpoint_frequency, + checkpoint_path, + logging_frequency, + total_iters=1, + verbose=False, + ): + global_batch_size = batch_size * world_size + # Note: Size of train_set is equal to the number of global batches + iters_per_epoch = len(train_set) + tqdm_train_set = tqdm(train_set) total_train_loss = 0 iter_idx = 1 - checkpoint_iters = 0 - total_iters = 0 + model.train() for batch in tqdm_train_set: - minibatches = split_into_minibatches(batch[0], batch[1], batch_size // accumulation_steps) + minibatches = split_into_minibatches(batch[0], batch[1], 
batch[2], batch_size // accumulation_steps) + minibatch_loss = 0 for minibatch in minibatches: with autocast(): - loss = process_one_batch(minibatch, model, world_size) / accumulation_steps + loss = process_one_batch(minibatch, model, verbose) / accumulation_steps scaler.scale(loss).backward() - total_train_loss += loss.item() + loss_val = loss.item() + minibatch_loss += loss_val + total_train_loss += loss_val + scaler.step(optimizer) scaler.update() @@ -180,60 +208,116 @@ def train_epoch(model, model.zero_grad(set_to_none=True) tqdm_train_set.set_postfix({str(global_rank) + '_train_loss': total_train_loss / iter_idx}) - if iter_idx == checkpoint_frequency: - checkpoint = { - 'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(), - 'optimizer': optimizer.state_dict(), - 'lr_sched': lr_scheduler.state_dict(), - 'epoch': epoch, - 'best_epoch': best_epoch, - 'min_eval_loss': min_eval_loss - } - - if checkpoint_iters == checkpoint_frequency: - torch.save(checkpoint, f'{checkpoint_path}/checkpoint{total_iters}.pth') - # torch.save(dataloader.state_dict(), f'{DATALOADER_PATH}/dataloader{total_iters}.pth') # THIS doesn't work - checkpoint_iters = 0 - - checkpoint_iters += 1 + # print(checkpoint_iters, checkpoint_frequency) + # print(total_iters) + + # Do eval + if iter_idx % checkpoint_frequency == 0: + eval_loss = eval_epoch(model, + eval_set, + batch_size, + accumulation_steps + ) + + + if global_rank == 0: + if iter_idx % logging_frequency == 0: + wandb.log({ + "train_loss": minibatch_loss, + "total_iters": total_iters, + }, step=total_iters) + + if iter_idx % checkpoint_frequency == 0: + # Log the latest loss for this checkpoint + wandb.log({ + "train_loss": minibatch_loss, + "total_iters": total_iters, + "ave_eval_loss": eval_loss if eval_loss else 0, + }, step=total_iters) + + # For sampler to start from this start_index when resuming checkpoint + train_sampler_start_index = (total_iters % iters_per_epoch)*global_batch_size + + 
checkpoint = { + 'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_sched': lr_scheduler.state_dict(), + 'epoch': epoch, + 'best_epoch': best_epoch, + 'min_eval_loss': min_eval_loss, + 'total_iters': total_iters, + 'train_sampler_start_index': train_sampler_start_index, + 'train_loss': minibatch_loss, + 'ave_eval_loss': eval_loss if eval_loss else 0, + } + checkpoint_filepath = f'{checkpoint_path}/checkpoint{total_iters}.pth' + torch.save(checkpoint, f'{checkpoint_path}/checkpoint{total_iters}.pth') + torch.save(checkpoint, f'{checkpoint_path}/latest.pth') + + # Save latest_checkpoint_info.json about the latest checkpoint info and path. + with open(f'{checkpoint_path}/latest_checkpoint_info.json', "w") as f: + checkpoint_data = { + "latest_checkpoint_path": checkpoint_filepath, + "epoch": epoch, + "best_epoch": best_epoch, + "total_iters": total_iters, + "train_sampler_start_index": train_sampler_start_index, + "train_loss": minibatch_loss, + "ave_eval_loss": eval_loss if eval_loss else 0, + } + json.dump(checkpoint_data, f) + + print(f"Checkpoint saved at {checkpoint_filepath}") + total_iters += 1 iter_idx += 1 - return total_train_loss / (iter_idx - 1) + return total_train_loss / max((iter_idx - 1), 1), total_iters # do one epoch for eval def eval_epoch(model, - eval_set, - batch_size, - accumulation_steps - ): - tqdm_eval_set = tqdm(eval_set) + eval_set, + batch_size, + accumulation_steps + ): + total_eval_loss = 0 iter_idx = 1 - model.eval() - # Evaluate data for one epoch - for batch in tqdm_eval_set: - minibatches = split_into_minibatches(batch[0], batch[1], batch_size // accumulation_steps) - for minibatch in minibatches: - with torch.no_grad(): - loss = process_one_batch(minibatch) / accumulation_steps - total_eval_loss += loss.item() - tqdm_eval_set.set_postfix({str(global_rank) + '_eval_loss': total_eval_loss / iter_idx}) - iter_idx += 1 - # return total_eval_loss / 
(iter_idx-1) + if len(eval_set) > 0: + model.eval() + tqdm_eval_set = tqdm(eval_set) + # Evaluate data for one epoch + for batch in tqdm_eval_set: + minibatches = split_into_minibatches(batch[0], batch[1], batch[2], batch_size // accumulation_steps) + for minibatch in minibatches: + with torch.no_grad(): + loss = process_one_batch(batch=minibatch, model=model) / accumulation_steps + total_eval_loss += loss.item() + tqdm_eval_set.set_postfix({str(global_rank) + '_eval_loss': total_eval_loss / iter_idx}) + iter_idx += 1 + + model.train() + + return total_eval_loss / max(iter_idx-1, 1) + # return 0 - return 0 def read_config_from_yaml(yaml_file): with open(yaml_file, 'r') as file: config = yaml.safe_load(file) return config + def main(args): config = read_config_from_yaml(args.train_config_path) + + LOAD_FROM_CHECKPOINT = args.load_from_checkpoint + config['load_from_checkpoint'] = LOAD_FROM_CHECKPOINT + print(config) + TRAIN_FOLDERS = config.get("train_folders") EVAL_FOLDERS = config.get("eval_folders") @@ -256,56 +340,65 @@ def main(args): BATCH_SIZE = config.get("batch_size") ACCUMULATION_STEPS = config.get("accumulation_steps") PATCH_SAMPLING_BATCH_SIZE = config.get("patch_sampling_batch_size") - LOAD_FROM_CHECKPOINT = config.get("load_from_checkpoint") + LOAD_FROM_PRE_CHECKPOINT = config.get("load_from_pre_checkpoint") CHECKPOINT_FREQUENCY = config.get("checkpoint_frequency") + LOGGING_FREQUENCY = config.get("logging_frequency") + VERBOSE = config.get("verbose") WANDB_CONFIG = config.get("wandb") WANDB_PROJ_NAME = WANDB_CONFIG.get("proj_name") WANDB_ENTITY = WANDB_CONFIG.get("entity") WANDB_MODE = WANDB_CONFIG.get("mode") + WANDB_NAME = WANDB_CONFIG.get("name", "run") FIRST_LAUNCH = config.get("first_launch") Path(BASE_DIR).mkdir(parents=True, exist_ok=True) Path(CHECKPOINT_PATH).mkdir(parents=True, exist_ok=True) Path(DATALOADER_PATH).mkdir(parents=True, exist_ok=True) - - wandb.init(project=WANDB_PROJ_NAME, entity=WANDB_ENTITY, mode=WANDB_MODE) - - 
wandb.config.update({ - "TRAIN_FOLDERS": TRAIN_FOLDERS, - "EVAL_FOLDERS": EVAL_FOLDERS, - # "PRE_WEIGHTS_PATH": PRE_WEIGHTS_PATH, - "WEIGHTS_PATH": WEIGHTS_PATH, - "LOGS_PATH": LOGS_PATH, - "PATCH_SIZE": PATCH_SIZE, - "PATCH_LENGTH": PATCH_LENGTH, - "BYTE_NUM_LAYERS": BYTE_NUM_LAYERS, - "PATCH_NUM_LAYERS": PATCH_NUM_LAYERS, - "HIDDEN_SIZE": HIDDEN_SIZE, - "NUM_EPOCHS": NUM_EPOCHS, - "LEARNING_RATE": LEARNING_RATE, - "BATCH_SIZE": BATCH_SIZE, - "ACCUMULATION_STEPS": ACCUMULATION_STEPS, - "LOAD_FROM_CHECKPOINT": LOAD_FROM_CHECKPOINT, - "LOAD_FROM_PRE_CHECKPOINT": LOAD_FROM_PRE_CHECKPOINT - # Add any other configurations you'd like to track - }) + + # Only log on master process + if global_rank == 0: + wandb.init(project=WANDB_PROJ_NAME, + entity=WANDB_ENTITY, + mode=WANDB_MODE, + dir=BASE_DIR, + name=WANDB_NAME + f"_{datetime.now().strftime('%Y%m%d_%H%M_%S')}") + + wandb.config.update({ + "TRAIN_FOLDERS": TRAIN_FOLDERS, + "EVAL_FOLDERS": EVAL_FOLDERS, + # "PRE_WEIGHTS_PATH": PRE_WEIGHTS_PATH, + "WEIGHTS_PATH": WEIGHTS_PATH, + "LOGS_PATH": LOGS_PATH, + "PATCH_SIZE": PATCH_SIZE, + "PATCH_LENGTH": PATCH_LENGTH, + "BYTE_NUM_LAYERS": BYTE_NUM_LAYERS, + "PATCH_NUM_LAYERS": PATCH_NUM_LAYERS, + "HIDDEN_SIZE": HIDDEN_SIZE, + "NUM_EPOCHS": NUM_EPOCHS, + "LEARNING_RATE": LEARNING_RATE, + "BATCH_SIZE": BATCH_SIZE, + "ACCUMULATION_STEPS": ACCUMULATION_STEPS, + "LOAD_FROM_CHECKPOINT": LOAD_FROM_CHECKPOINT, + "LOAD_FROM_PRE_CHECKPOINT": LOAD_FROM_PRE_CHECKPOINT + # Add any other configurations you'd like to track + }) batch_size = BATCH_SIZE patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS, - max_length=PATCH_LENGTH, - max_position_embeddings=PATCH_LENGTH, - hidden_size=HIDDEN_SIZE, - n_head=HIDDEN_SIZE // 64, - vocab_size=1) + max_length=PATCH_LENGTH, + max_position_embeddings=PATCH_LENGTH, + hidden_size=HIDDEN_SIZE, + n_head=HIDDEN_SIZE // 64, + vocab_size=1) byte_config = GPT2Config(num_hidden_layers=BYTE_NUM_LAYERS, - max_length=PATCH_SIZE + 1, - 
max_position_embeddings=PATCH_SIZE + 1, - hidden_size=HIDDEN_SIZE, - n_head=HIDDEN_SIZE // 64, - vocab_size=256 + 1) + max_length=PATCH_SIZE + 1, + max_position_embeddings=PATCH_SIZE + 1, + hidden_size=HIDDEN_SIZE, + n_head=HIDDEN_SIZE // 64, + vocab_size=256 + 1) model = bGPTLMHeadModel(patch_config, byte_config) model = model.to(device) @@ -317,8 +410,60 @@ def main(args): scaler = GradScaler() is_autocast = True + + model = model.to(device) + + is_checkpoint_loaded = False + checkpoint = None + train_sampler_start_index = 0 optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) + if LOAD_FROM_CHECKPOINT: + # Load checkpoint to CPU + # most_recent_checkpoint = find_most_recent_file(CHECKPOINT_PATH, pattern="checkpoint*.pth") + most_recent_checkpoint = find_most_recent_file_from_checkpoint_info(f"{CHECKPOINT_PATH}/latest_checkpoint_info.json") + if most_recent_checkpoint is not None: + WEIGHTS_PATH = most_recent_checkpoint + checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu') + + # Here, model is assumed to be on GPU + # Load state dict to CPU model first, then move the model to GPU + if torch.cuda.device_count() > 1: + # If you have a DataParallel model, you need to load to model.module instead + cpu_model = deepcopy(model.module) + cpu_model.load_state_dict(checkpoint['model']) + model.module.load_state_dict(cpu_model.state_dict()) + else: + # Load to a CPU clone of the model, then load back + cpu_model = deepcopy(model) + cpu_model.load_state_dict(checkpoint['model']) + model.load_state_dict(cpu_model.state_dict()) + + optimizer.load_state_dict(checkpoint['optimizer']) + # Note: lr_scheduler is loaded from checkpoint later after training dataloader is defined. 
+ # lr_scheduler.load_state_dict(checkpoint['lr_sched']) + pre_epoch = checkpoint['epoch'] + best_epoch = checkpoint['best_epoch'] + min_eval_loss = checkpoint['min_eval_loss'] + total_iters = checkpoint['total_iters'] + train_sampler_start_index = checkpoint['train_sampler_start_index'] + checkpoint_train_loss = checkpoint['train_loss'] + print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch) + is_checkpoint_loaded = True + + # Log checkpoint's train_loss and total_iters for sanity checking on wandb + if global_rank == 0: + wandb.log({ + "train_loss": checkpoint_train_loss, + "total_iters": total_iters, + }, step=total_iters) + + else: + pre_epoch = 1 + best_epoch = 1 + min_eval_loss = 100 + total_iters = 1 + # load filenames under train and eval folder train_files = list_files_in_directory(TRAIN_FOLDERS) eval_files = list_files_in_directory(EVAL_FOLDERS) @@ -337,25 +482,10 @@ def main(args): eval_dataset = ByteDataset(eval_files, PATCH_SIZE, PATCH_LENGTH) # Initialize DistributedSampler - train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=local_rank) - eval_sampler = DistributedSampler(eval_dataset, num_replicas=world_size, rank=local_rank) - - # Load the most recent DataLoader state (if applicable) - # if args.first_launch != "True": - if not FIRST_LAUNCH: - # For train_set, if you're saving states of the sampler, load it here - most_recent_train_dataloader_state = find_most_recent_file(DATALOADER_PATH, pattern="train_dataloader_*.pth") - if most_recent_train_dataloader_state: - # Assuming you have a mechanism to load the sampler's state - train_sampler_state = torch.load(most_recent_train_dataloader_state, map_location='cpu') - train_sampler.load_state_dict(train_sampler_state) - print(f"Loaded train sampler state from {most_recent_train_dataloader_state}") - - # Similar logic could apply to eval_set if you have a stateful eval_sampler - most_recent_eval_dataloader_state = find_most_recent_file(DATALOADER_PATH, 
pattern="eval_dataloader_*.pth") - if most_recent_eval_dataloader_state: - # Load eval_sampler state if necessary - pass + train_sampler = CustomDistributedSampler(train_dataset, num_replicas=world_size, rank=global_rank, + start_index=train_sampler_start_index) + + eval_sampler = CustomDistributedSampler(eval_dataset, num_replicas=world_size, rank=global_rank) # Initialize DataLoaders with potentially state-restored samplers train_set = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_batch, sampler=train_sampler, @@ -363,120 +493,104 @@ def main(args): eval_set = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collate_batch, sampler=eval_sampler, shuffle=False) + print(train_set) + # print(train_set.state_dict()) + # print(train_sampler.state_dict()) + # print(dir(train_sampler)) + # print(dir(train_set)) + lr_scheduler = get_scheduler( name="cosine", optimizer=optimizer, num_warmup_steps=NUM_EPOCHS * len(train_set) // 10, num_training_steps=NUM_EPOCHS * len(train_set), ) - model = model.to(device) - optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) - - if LOAD_FROM_PRE_CHECKPOINT and os.path.exists(PRE_WEIGHTS_PATH): - # Load checkpoint to CPU - checkpoint = torch.load(PRE_WEIGHTS_PATH, map_location='cpu') - - # Here, model is assumed to be on GPU - # Load state dict to CPU model first, then move the model to GPU - if torch.cuda.device_count() > 1: - # If you have a DataParallel model, you need to load to model.module instead - cpu_model = deepcopy(model.module) - cpu_model.load_state_dict(checkpoint['model']) - model.module.load_state_dict(cpu_model.state_dict()) - else: - # Load to a CPU clone of the model, then load back - cpu_model = deepcopy(model) - cpu_model.load_state_dict(checkpoint['model']) - model.load_state_dict(cpu_model.state_dict()) - - print( - f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}") - - else: - pre_epoch = 0 - best_epoch 
= 0 - min_eval_loss = 100 - - if LOAD_FROM_CHECKPOINT and os.path.exists(WEIGHTS_PATH): - # Load checkpoint to CPU - # TODO: Missing function. Is this supposed to be find_most_recent_file? - # most_recent_checkpoint = find_most_recent_checkpoint(CHECKPOINT_PATH) - most_recent_checkpoint = find_most_recent_file(CHECKPOINT_PATH, pattern="checkpoint*.pth") - if most_recent_checkpoint is not None: - WEIGHTS_PATH = most_recent_checkpoint - checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu') - - # Here, model is assumed to be on GPU - # Load state dict to CPU model first, then move the model to GPU - if torch.cuda.device_count() > 1: - # If you have a DataParallel model, you need to load to model.module instead - cpu_model = deepcopy(model.module) - cpu_model.load_state_dict(checkpoint['model']) - model.module.load_state_dict(cpu_model.state_dict()) - else: - # Load to a CPU clone of the model, then load back - cpu_model = deepcopy(model) - cpu_model.load_state_dict(checkpoint['model']) - model.load_state_dict(cpu_model.state_dict()) - optimizer.load_state_dict(checkpoint['optimizer']) + + if is_checkpoint_loaded and checkpoint is not None: lr_scheduler.load_state_dict(checkpoint['lr_sched']) - pre_epoch = checkpoint['epoch'] - best_epoch = checkpoint['best_epoch'] - min_eval_loss = checkpoint['min_eval_loss'] - print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch) - checkpoint = None - else: - pre_epoch = 0 - best_epoch = 0 - min_eval_loss = 100 + # Note: Code that uses pre-checkpoint, which we aren't using now + # if LOAD_FROM_PRE_CHECKPOINT and os.path.exists(PRE_WEIGHTS_PATH): + # # Load checkpoint to CPU + # checkpoint = torch.load(PRE_WEIGHTS_PATH, map_location='cpu') + + # # Here, model is assumed to be on GPU + # # Load state dict to CPU model first, then move the model to GPU + # if torch.cuda.device_count() > 1: + # # If you have a DataParallel model, you need to load to model.module instead + # cpu_model = deepcopy(model.module) + # 
cpu_model.load_state_dict(checkpoint['model']) + # model.module.load_state_dict(cpu_model.state_dict()) + # else: + # # Load to a CPU clone of the model, then load back + # cpu_model = deepcopy(model) + # cpu_model.load_state_dict(checkpoint['model']) + # model.load_state_dict(cpu_model.state_dict()) + + # print( + # f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}") + + # else: + # pre_epoch = 1 + # best_epoch = 1 + # min_eval_loss = 100 for epoch in range(1 + pre_epoch, NUM_EPOCHS + 1): train_sampler.set_epoch(epoch) eval_sampler.set_epoch(epoch) print('-' * 21 + "Epoch " + str(epoch) + '-' * 21) - train_loss = train_epoch(model, - train_set, - lr_scheduler, - scaler, - optimizer, - epoch, - best_epoch, - min_eval_loss, - BATCH_SIZE, - ACCUMULATION_STEPS, - CHECKPOINT_FREQUENCY, - CHECKPOINT_PATH - ) - - eval_loss = eval_epoch(model, - eval_set, - BATCH_SIZE, - ACCUMULATION_STEPS - ) + ave_train_loss, total_iters = train_epoch(model, + train_set, + eval_set, + lr_scheduler, + scaler, + optimizer, + epoch, + best_epoch, + min_eval_loss, + BATCH_SIZE, + ACCUMULATION_STEPS, + CHECKPOINT_FREQUENCY, + CHECKPOINT_PATH, + LOGGING_FREQUENCY, + total_iters, + VERBOSE + ) + if len(eval_set) != 0: + eval_loss = eval_epoch(model, + eval_set, + BATCH_SIZE, + ACCUMULATION_STEPS + ) + else: + eval_loss = 0 if global_rank == 0: with open(LOGS_PATH, 'a') as f: - f.write("Epoch " + str(epoch) + "\ntrain_loss: " + str(train_loss) + "\neval_loss: " + str( + f.write("Epoch " + str(epoch) + "\nave_train_loss: " + str(ave_train_loss) + "\neval_loss: " + str( eval_loss) + "\ntime: " + time.asctime(time.localtime(time.time())) + "\n\n") + wandb.log({ - "train_loss": train_loss, - "eval_loss": eval_loss, - "epoch": epoch - }) - if eval_loss < min_eval_loss: - best_epoch = epoch - min_eval_loss = eval_loss - checkpoint = { - 'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(), - 
'optimizer': optimizer.state_dict(), - 'lr_sched': lr_scheduler.state_dict(), - 'epoch': epoch, - 'best_epoch': best_epoch, - 'min_eval_loss': min_eval_loss - } - torch.save(checkpoint, WEIGHTS_PATH) + "epoch_ave_train_loss": ave_train_loss, + "ave_eval_loss": eval_loss, + "epoch": epoch, + "total_iters": total_iters, + }, step=total_iters) + + # if eval_loss < min_eval_loss: + # best_epoch = epoch + # min_eval_loss = eval_loss + # checkpoint = { + # 'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(), + # 'optimizer': optimizer.state_dict(), + # 'lr_sched': lr_scheduler.state_dict(), + # 'epoch': epoch, + # 'best_epoch': best_epoch, + # 'min_eval_loss': min_eval_loss, + # 'total_iters': total_iters, + # } + # torch.save(checkpoint, WEIGHTS_PATH) # torch.save(dataloader.state_dict(), checkpoint_path) if world_size > 1: @@ -487,11 +601,14 @@ def main(args): print("Min Eval Loss : " + str(min_eval_loss)) - # train and eval if __name__ == "__main__": parser = argparse.ArgumentParser(description="Training script for bGBT") - parser.add_argument("--train-config-path", type=str, required=True, help="Path to the config YAML file for training run") + parser.add_argument("--train-config-path", type=str, required=True, + help="Path to the config YAML file for training run") + parser.add_argument("--load-from-checkpoint", action='store_true', dest="load_from_checkpoint", + help="If ths flag is present, model checkpoint will be loaded. By default without the flag, checkpoint will not be loaded.") + parser.set_defaults(load_from_checkpoint=False) args = parser.parse_args() main(args) diff --git a/scripts/abc-midi/README.md b/scripts/abc-midi/README.md new file mode 100644 index 0000000..5ce7c5c --- /dev/null +++ b/scripts/abc-midi/README.md @@ -0,0 +1,22 @@ +#### Configuration +Clone this repo for abc->midi conversion. The same repository was mentioned in the paper for conversion. 
+``` + cd path/to/scripts/abc-midi + git clone git@github.com:xlvector/abcmidi.git + cd abcmidi + cmake +``` + +**NOTE**: Change the save directories in the commands below as needed. +### Download irishman dataset +``` + cd path/to/scripts/abc-midi + python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split train -o ../../data/abc_data/train + python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split validation -o ../../data/abc_data/val +``` +### abc to midi file conversion +``` + cd path/to/scripts/abc-midi + python abc2midi.py ../../data/abc_data/train -o ../../data/midi_data/train + python abc2midi.py ../../data/abc_data/val -o ../../data/midi_data/val +``` diff --git a/scripts/abc-midi/abc2midi.py b/scripts/abc-midi/abc2midi.py new file mode 100644 index 0000000..630ccd6 --- /dev/null +++ b/scripts/abc-midi/abc2midi.py @@ -0,0 +1,53 @@ +import os +import argparse +import subprocess as sb +from tqdm import tqdm +from multiprocessing import Pool +from functools import partial + + +def _single_file(filename, abc_path, output_path): + # abc2midi is the package that converts formats (ref: https://github.com/xlvector/abcmidi) + command = ["abcmidi/abc2midi", os.path.join(abc_path, filename), "-o", os.path.join(output_path, filename[:-4]+".mid")] + process = sb.run(command, stderr=sb.PIPE, stdout=sb.PIPE) + + # storing err and outs to verify file conversion warning logs if needed + stdout_output = process.stdout.decode('utf-8') + stderr_output = process.stderr.decode('utf-8') + return stdout_output, stderr_output + +# install abcmidi package first +def convert_abc_to_midi(abc_path, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + + # check log file after conversion finishes for any errors if midi files are not working + with open("conversion_log", "w") as log_file, Pool() as pool: + filenames = os.listdir(abc_path) + process_func = partial(_single_file, abc_path=abc_path, output_path=output_path) + results = 
list(tqdm(pool.imap(process_func, filenames, chunksize=20), total=len(filenames), desc="Converting abc files to midi files")) # reduce chunksize for lower memory usage + + for stdout_output, stderr_output in results: + log_file.write("STDOUT output:\n") + log_file.write(stdout_output) + log_file.write("\n\n") + log_file.write("STDERR output:\n") + log_file.write(stderr_output) + log_file.write("\n\n") + + +def main(): + parser = argparse.ArgumentParser(description="Convert abc files to midi files.") + parser.add_argument("abc_path", type=str, help="Path to directory containing abc files.") + parser.add_argument("-o", "--output_path", type=str, help="Output directory for saving midi files.", required=True) + args = parser.parse_args() + + convert_abc_to_midi(args.abc_path, args.output_path) + +if __name__ == "__main__": + main() + + # e.g. python abc2midi.py ../../data/abc_data/train -o ../../data/midi_data/train + # e.g. python abc2midi.py ../../data/abc_data/val -o ../../data/midi_data/val + # python abc2midi.py abc_data/val -o midi_data/val + \ No newline at end of file diff --git a/scripts/abc-midi/prep_abc_data.py b/scripts/abc-midi/prep_abc_data.py new file mode 100644 index 0000000..e95c1cc --- /dev/null +++ b/scripts/abc-midi/prep_abc_data.py @@ -0,0 +1,39 @@ +import os +import argparse +from datasets import load_dataset +from tqdm import tqdm + +def get_abc(dataset, cache_path, split, output_path): + + if not os.path.exists(output_path): + os.makedirs(output_path) + + data = load_dataset(dataset, cache_dir=cache_path, split=split) + # Loop through the dataset and save each 'abc notation' value into separate .abc files + for idx, sample in tqdm(enumerate(data), total=len(data), desc="Unpacking dataset into abc files"): + abc_notation = sample['abc notation'] + file_name = f"{idx}.abc" + file_path = os.path.join(output_path, file_name) + with open(file_path, 'w') as file: + file.write(abc_notation) + + + +def main(): + parser = 
argparse.ArgumentParser(description="Download and setup abc file train-val directories.") + parser.add_argument("dataset_id", type=str, help="Specify huggingface dataset id flag.") + parser.add_argument("cache_dir", type=str, help="Path to download huggingface dataset.") + parser.add_argument("--split", type=str, help="Train/Validation/Test split depending on the dataset.", required=True) + parser.add_argument("-o", "--output_dir", type=str, help="Set output directory for saving abc files.", required=True) + + args = parser.parse_args() + + get_abc(args.dataset_id, args.cache_dir, args.split, args.output_dir) + + +if __name__=="__main__": + main() + + # e.g. + # python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split train -o ../../data/abc_data/train + # python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split validation -o ../../data/abc_data/val \ No newline at end of file diff --git a/scripts/abc-midi/train.sh b/scripts/abc-midi/train.sh new file mode 100644 index 0000000..721542c --- /dev/null +++ b/scripts/abc-midi/train.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH -J abc-midi-bidirectional-btch16-paatch16-v1 +#SBATCH -o ./logs/abcmidi-btch16-ptch16-v1.out +#SBATCH --error=./logs/abcmidi-btch16-ptch16-v1.err +#SBATCH -c 4 +#SBATCH --gres=gpu:a100:1 +#SBATCH --mem=32G +#SBATCH -t 48:00:00 +#SBATCH --partition=unkillable + + +export WANDB_API_KEY=$(awk '/api_key/ {print $3}' /home/mila/m/mina.beiramy/.wandb_config) +module load anaconda/3 +module load cudatoolkit/11.6 +conda activate bgpt +python ../../bgpt/train-gen-og.py \ No newline at end of file diff --git a/scripts/easy-math/text_file_generator_batched.py b/scripts/easy-math/text_file_generator_batched.py index fb7b0dc..d0b2121 100644 --- a/scripts/easy-math/text_file_generator_batched.py +++ b/scripts/easy-math/text_file_generator_batched.py @@ -59,14 +59,14 @@ def merge_write(input_files, output_dir, max_examples_per_file, max_num_output_f # List of input file names 
-input_files = ['/lustre/orion/csc590/proj-shared/math-adder/2-digit-symbols/train.txt', - '/lustre/orion/csc590/proj-shared/math-adder/3-digit-symbols/train.txt', - '/lustre/orion/csc590/proj-shared/math-adder/4-digit-symbols/train.txt', - '/lustre/orion/csc590/proj-shared/math-adder/5-digit-symbols/train.txt', +input_files = ['/nfs/scratch/jonathan/byte_models/data/2-digit-symbols/train.txt', + '/nfs/scratch/jonathan/byte_models/data/3-digit-symbols/train.txt', + '/nfs/scratch/jonathan/byte_models/data/4-digit-symbols/train.txt', + '/nfs/scratch/jonathan/byte_models/data/5-digit-symbols/train.txt', ] # Output file name -output_dir = '/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train' +output_dir = '/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/train' Path(output_dir).mkdir(parents=True, exist_ok=True) # Will generate output files, each output file having examples interleaved from the various input files merge_write(input_files, output_dir, max_examples_per_file=550, max_num_output_files=1000000)