diff --git a/bgpt/conditional_launch.sh b/bgpt/conditional_launch.sh
new file mode 100644
index 0000000..96bf630
--- /dev/null
+++ b/bgpt/conditional_launch.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Job script submitted via sbatch by launch_global.sh: runs bGPT training through srun + torchrun.
+# Directory containing train-gen.py
+SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/configs/config_110M.yaml"
+NUM_NODES=32
+NUM_GPUS_PER_NODE=8
+
+# train-gen.py is given as a relative path below, so stop if the directory is unavailable.
+cd "$SRC_DIR" || exit 1
+source activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt
+module load rocm/5.2
+
+# First IPv4 address reported for hsn0; used as the torchrun rendezvous host.
+MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
+export MASTER_IP
+# mkdir "$SLURM_JOB_NAME"
+# mkdir "$SLURM_JOB_NAME"/checkpoints
+# mkdir "$SLURM_JOB_NAME"/dataloaders
+
+# A first launch and a resume differ only by the trailing --load-from-checkpoint flag.
+resume_args=()
+if [ "$1" = "--load-from-checkpoint" ]; then resume_args=(--load-from-checkpoint); fi
+
+srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}" "${resume_args[@]}"
\ No newline at end of file
diff --git a/bgpt/conditional_launch_jon.sh b/bgpt/conditional_launch_jon.sh
new file mode 100644
index 0000000..5651eb1
--- /dev/null
+++ b/bgpt/conditional_launch_jon.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Job script submitted via sbatch by launch_global_jon.sh: runs bGPT training through srun + torchrun.
+# Directory containing train-gen.py
+SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_110M_jon.yaml"
+# TODO: Pass this through from launch_global_jon.sh
+NUM_NODES=2
+
+# train-gen.py is given as a relative path below, so stop if the directory is unavailable.
+cd "$SRC_DIR" || exit 1
+source activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2
+module load rocm/5.2
+
+# First IPv4 address reported for hsn0; used as the torchrun rendezvous host.
+MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
+export MASTER_IP
+# mkdir "$SLURM_JOB_NAME"
+# mkdir "$SLURM_JOB_NAME"/checkpoints
+# mkdir "$SLURM_JOB_NAME"/dataloaders
+
+# A first launch and a resume differ only by the trailing --load-from-checkpoint flag.
+resume_args=()
+if [ "$1" = "--load-from-checkpoint" ]; then resume_args=(--load-from-checkpoint); fi
+srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node=8 --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}" "${resume_args[@]}"
+
+
diff --git a/bgpt/conditional_launch_nvidia.sh b/bgpt/conditional_launch_nvidia.sh
new file mode 100644
index 0000000..b90bc5e
--- /dev/null
+++ b/bgpt/conditional_launch_nvidia.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Job script submitted via sbatch by launch_global_nvidia.sh: runs bGPT training through srun + torchrun.
+# Directory containing train-gen.py
+SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_math_nvidia.yaml"
+# TODO: Pass this through from launch_global_nvidia.sh
+NUM_NODES=1
+NUM_GPUS_PER_NODE=4
+
+# train-gen.py is given as a relative path below, so stop if the directory is unavailable.
+cd "$SRC_DIR" || exit 1
+source "${HOME}/.bashrc"
+source activate /nfs/scratch/jonathan/micromamba/envs/bgpt
+
+# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+export MASTER_IP=localhost
+
+# A first launch and a resume differ only by the trailing --load-from-checkpoint flag.
+resume_args=()
+if [ "$1" = "--load-from-checkpoint" ]; then resume_args=(--load-from-checkpoint); fi
+srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}" "${resume_args[@]}"
+
+
diff --git a/bgpt/conditional_launch_nvidia_small.sh b/bgpt/conditional_launch_nvidia_small.sh
new file mode 100644
index 0000000..457d21f
--- /dev/null
+++ b/bgpt/conditional_launch_nvidia_small.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Job script submitted via sbatch by launch_global_nvidia_small.sh: runs bGPT training through srun + torchrun.
+# Directory containing train-gen.py
+SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_test.yaml"
+# TODO: Pass this through from launch_global_nvidia_small.sh
+NUM_NODES=1
+NUM_GPUS_PER_NODE=2
+
+# train-gen.py is given as a relative path below, so stop if the directory is unavailable.
+cd "$SRC_DIR" || exit 1
+source "${HOME}/.bashrc"
+source activate /nfs/scratch/jonathan/micromamba/envs/bgpt
+
+# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+export MASTER_IP=localhost
+
+# A first launch and a resume differ only by the trailing --load-from-checkpoint flag.
+resume_args=()
+if [ "$1" = "--load-from-checkpoint" ]; then resume_args=(--load-from-checkpoint); fi
+srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}" "${resume_args[@]}"
+
+
diff --git a/bgpt/config_og.py b/bgpt/config_og.py
new file mode 100644
index 0000000..5bd5d47
--- /dev/null
+++ b/bgpt/config_og.py
@@ -0,0 +1,59 @@
+# Configuration for generative modelling and classification
+TRAIN_FOLDERS = [
+                # "wikipedia/train",  
+                # "ag_news/train", 
+                # "imagenet32/train", 
+                # "cifar/train", 
+                # "librispeech8K/train", 
+                # "speech_commands8K/train", 
+                #"irishman/train",
+                #"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy",
+                "/home/mila/m/mina.beiramy/scratch/bgpt/data/mix",
+                # "cpu_states/train",
+                 ]     # Folder containing training data
+EVAL_FOLDERS = [
+                # "wikipedia/test",  
+                # "ag_news/test", 
+                # "imagenet32/test", 
+                # "cifar/test", 
+                # "librispeech8K/test", 
+                # "speech_commands8K/test", 
+                #"irishman/test",
+                #"/home/mila/m/mina.beiramy/scratch/bgpt/data/dummy_val",
+                "/home/mila/m/mina.beiramy/scratch/bgpt/data/mix_val",
+                # "cpu_states/test",
+                ]                                               # Folder containing evaluation data
+
+# Configuration for the paths
+PRETRAINED_PATH = "weights-text.pth"                            # Path to pre-trained weights
+WEIGHTS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/exp/chkp/abcmidi-btch16-ptch16-v1.pth"                       # Path to save weights
+LOGS_PATH = "/home/mila/m/mina.beiramy/workspace/byte_models/scripts/abc-midi/logs/logs-abcmidi-btch16-ptch16-v1.txt"                              # Path to save logs
+
+# Configuration for the model
+PATCH_SIZE = 16                                                 # Patch Size
+PATCH_LENGTH = 512                                              # Patch Length
+BYTE_NUM_LAYERS = 3                                             # Number of layers in the decoder
+PATCH_NUM_LAYERS = 12                                           # Number of layers in the encoder
+HIDDEN_SIZE = 768                                               # Hidden Size
+
+# Configuration for the training
+NUM_EPOCHS = 10                                                 # Number of epochs to train for (if early stopping doesn't intervene)
+LEARNING_RATE = 1e-5                                            # Learning rate for the optimizer
+BATCH_SIZE = 16                                                  # Batch size for training
+ACCUMULATION_STEPS = 1                                          # Accumulation steps to simulate large batch size
+PATCH_SAMPLING_BATCH_SIZE = 0                                   # Batch size for patch during training, 0 for full context
+LOAD_FROM_CHECKPOINT = False                                    # Whether to load weights from a checkpoint
+LOAD_FROM_PRETRAINED = True                                     # Whether to load pre-trained weights from a checkpoint
+CONVERSION_MODE = 'abc&mid'                                          # Mode of conversion (None for regular training, input->output for unidirectional conversion, input&output for bidirectional conversion)
+
+# Configuration for inference
+INFERENCE_WEIGHTS_PATH = "weights-conversion.pth"               # Path to weights for inference
+INPUT_EXT = "abc"                                               # Extension of input files, used for conversion
+TARGET_EXT = "mid"                                              # Extension of target files
+INPUT_FOLDER = "input"                                          # Folder containing input files
+OUTPUT_FOLDER = "output"                                        # Folder to save output files
+MODE = "convert"                                                # Mode of inference (convert or generate)
+NUM_SAMPLES = 100                                               # Number of samples to generate (only for generate mode)
+TOP_K = 0                                                       # Top k for sampling
+TOP_P = 1.                                                      # Top p for sampling
+TEMPERATURE = 1                                                 # Temperature for sampling
\ No newline at end of file
diff --git a/bgpt/configs/config_110M.yaml b/bgpt/configs/config_110M.yaml
index d5924c2..e602679 100644
--- a/bgpt/configs/config_110M.yaml
+++ b/bgpt/configs/config_110M.yaml
@@ -1,7 +1,7 @@
 train_folders:
-  - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample"
+  - "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3"
 eval_folders: []
-base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/bgpt_110m_ezmath_2digit"
+base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia"
 weights_path: "weights.pth"
 pre_weights_path: null
 logs_path: "log.txt"
@@ -17,10 +17,11 @@ learning_rate: 0.0001
 batch_size: 16
 accumulation_steps: 1
 patch_sampling_batch_size: 0
-load_from_checkpoint: false
+# load_from_checkpoint: false # Deprecated: replaced by the --load-from-checkpoint command-line flag (off by default), so there is no need to maintain one config where this is false and a separate reload config where this is true.
 load_from_pre_checkpoint: false
 checkpoint_frequency: 5
 first_launch: true
+verbose: true
 wandb:
   proj_name: byte_models
   entity: jonathanlimsc
diff --git a/bgpt/configs/config_110M_jon.yaml b/bgpt/configs/config_110M_jon.yaml
new file mode 100644
index 0000000..1cf2e36
--- /dev/null
+++ b/bgpt/configs/config_110M_jon.yaml
@@ -0,0 +1,34 @@
+train_folders:
+  - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample_10K"
+  # - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train"
+  # - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-easy"
+  # - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-medium"
+  # - "/lustre/orion/csc590/proj-shared/mathematics_dataset-v1.0/train-hard"
+  # - "/lustre/orion/csc590/proj-shared/wikipedia/train_text3"
+  # - "/lustre/orion/csc590/proj-shared/imagenet-1k/train"
+eval_folders: []
+base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/test_continual"
+weights_path: "weights.pth"
+pre_weights_path: null
+logs_path: "log.txt"
+checkpoint_path: "checkpoints"
+dataloader_path: "dataloaders"
+patch_size: 16
+patch_length: 512
+byte_num_layers: 3
+patch_num_layers: 12
+hidden_size: 768
+num_epochs: 32
+learning_rate: 0.0001
+batch_size: 16
+accumulation_steps: 1
+patch_sampling_batch_size: 0
+# load_from_checkpoint: false # Deprecated: replaced by the --load-from-checkpoint command-line flag (off by default), so there is no need to maintain one config where this is false and a separate reload config where this is true.
+load_from_pre_checkpoint: false
+checkpoint_frequency: 40
+first_launch: true
+verbose: false
+wandb:
+  proj_name: byte_models
+  entity: jonathanlimsc
+  mode: offline
diff --git a/bgpt/configs/config_110M_math_nvidia.yaml b/bgpt/configs/config_110M_math_nvidia.yaml
new file mode 100644
index 0000000..4522a82
--- /dev/null
+++ b/bgpt/configs/config_110M_math_nvidia.yaml
@@ -0,0 +1,31 @@
+train_folders:
+  - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/train"
+eval_folders:
+  - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/validation"
+base_dir: "/nfs/scratch/jonathan/byte_models/simple_math"
+weights_path: "weights.pth"
+pre_weights_path: null
+logs_path: "log.txt"
+checkpoint_path: "checkpoints"
+dataloader_path: "dataloaders"
+patch_size: 16
+patch_length: 512
+byte_num_layers: 3
+patch_num_layers: 12
+hidden_size: 768
+num_epochs: 10
+learning_rate: 0.0001
+batch_size: 8
+accumulation_steps: 1
+patch_sampling_batch_size: 0
+# load_from_checkpoint: false # Deprecated: replaced by the --load-from-checkpoint command-line flag (off by default), so there is no need to maintain one config where this is false and a separate reload config where this is true.
+load_from_pre_checkpoint: false
+checkpoint_frequency: 10000
+logging_frequency: 100
+first_launch: true
+verbose: false
+wandb:
+  proj_name: byte_models
+  entity: jonathanlimsc
+  mode: online # put 'offline' if you don't want it to upload to the W&B server
+  name: simple_math
diff --git a/bgpt/configs/config_110M_test.yaml b/bgpt/configs/config_110M_test.yaml
new file mode 100644
index 0000000..a6f4269
--- /dev/null
+++ b/bgpt/configs/config_110M_test.yaml
@@ -0,0 +1,31 @@
+train_folders:
+  - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/sample_10K"
+eval_folders:
+  - "/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/val_1K"
+base_dir: "/nfs/scratch/jonathan/byte_models/test_eval"
+weights_path: "weights.pth"
+pre_weights_path: null
+logs_path: "log.txt"
+checkpoint_path: "checkpoints"
+dataloader_path: "dataloaders"
+patch_size: 16
+patch_length: 512
+byte_num_layers: 3
+patch_num_layers: 12
+hidden_size: 768
+num_epochs: 10
+learning_rate: 0.0001
+batch_size: 8
+accumulation_steps: 1
+patch_sampling_batch_size: 0
+# load_from_checkpoint: false # Deprecated: replaced by the --load-from-checkpoint command-line flag (off by default), so there is no need to maintain one config where this is false and a separate reload config where this is true.
+load_from_pre_checkpoint: false
+checkpoint_frequency: 1000
+logging_frequency: 100
+first_launch: true
+verbose: false
+wandb:
+  proj_name: byte_models
+  entity: jonathanlimsc
+  mode: online # put 'offline' if you don't want it to upload to the W&B server
+  name: test_eval
diff --git a/bgpt/configs/config_1B_jon.yaml b/bgpt/configs/config_1B_jon.yaml
new file mode 100644
index 0000000..d846606
--- /dev/null
+++ b/bgpt/configs/config_1B_jon.yaml
@@ -0,0 +1,28 @@
+train_folders:
+  - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/sample"
+eval_folders: []
+base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun"
+weights_path: "weights.pth"
+pre_weights_path: null
+logs_path: "log.txt"
+checkpoint_path: "checkpoints"
+dataloader_path: "dataloaders"
+patch_size: 16
+patch_length: 512
+byte_num_layers: 3
+patch_num_layers: 16
+hidden_size: 2048
+num_epochs: 5
+learning_rate: 0.0001
+batch_size: 4
+accumulation_steps: 1
+patch_sampling_batch_size: 0
+load_from_checkpoint: false
+load_from_pre_checkpoint: false
+checkpoint_frequency: 5
+first_launch: true
+verbose: false
+wandb:
+  proj_name: byte_models
+  entity: jonathanlimsc
+  mode: offline
diff --git a/bgpt/configs/config_400M_jon.yaml b/bgpt/configs/config_400M_jon.yaml
new file mode 100644
index 0000000..b36ae20
--- /dev/null
+++ b/bgpt/configs/config_400M_jon.yaml
@@ -0,0 +1,28 @@
+train_folders:
+  - "/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train"
+eval_folders: []
+base_dir: "/lustre/orion/csc590/scratch/jonathanlimsc/bgpt/testrun"
+weights_path: "weights.pth"
+pre_weights_path: null
+logs_path: "log.txt"
+checkpoint_path: "checkpoints"
+dataloader_path: "dataloaders"
+patch_size: 16
+patch_length: 512
+byte_num_layers: 3
+patch_num_layers: 24
+hidden_size: 1024
+num_epochs: 5
+learning_rate: 0.0001
+batch_size: 8
+accumulation_steps: 1
+patch_sampling_batch_size: 0
+load_from_checkpoint: false
+load_from_pre_checkpoint: false
+checkpoint_frequency: 5
+first_launch: true
+verbose: false
+wandb:
+  proj_name: byte_models
+  entity: jonathanlimsc
+  mode: offline
diff --git a/bgpt/configs/reload_config_110M.yaml b/bgpt/configs/reload_config_110M.yaml
new file mode 100644
index 0000000..077d149
--- /dev/null
+++ b/bgpt/configs/reload_config_110M.yaml
@@ -0,0 +1,28 @@
+train_folders:
+  - "/lustre/orion/csc590/scratch/george-adams/data/wikipedia/train_text3"
+eval_folders: []
+base_dir: "/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt/110m_wikipedia"
+weights_path: "weights.pth"
+pre_weights_path: null
+logs_path: "log.txt"
+checkpoint_path: "checkpoints"
+dataloader_path: "dataloaders"
+patch_size: 16
+patch_length: 1024
+byte_num_layers: 3
+patch_num_layers: 12
+hidden_size: 768
+num_epochs: 32
+learning_rate: 0.0001
+batch_size: 16
+accumulation_steps: 1
+patch_sampling_batch_size: 0
+load_from_checkpoint: true
+load_from_pre_checkpoint: false
+checkpoint_frequency: 5
+first_launch: true
+verbose: true
+wandb:
+  proj_name: byte_models
+  entity: jonathanlimsc
+  mode: offline
diff --git a/bgpt/launch-interactive.sh b/bgpt/launch-interactive.sh
new file mode 100755
index 0000000..155b15a
--- /dev/null
+++ b/bgpt/launch-interactive.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Runs train-gen.py directly with python (no srun/torchrun) for interactive use.
+USERNAME=jonathanlimsc
+# Define the source and target directories
+SRC_DIR="/lustre/orion/csc590/scratch/${USERNAME}/bgpt"
+
+# Resolved against the invocation directory (before cd). NOTE(review): launch.sh uses config_110M.yaml; confirm config_100M.yaml exists.
+CONFIG_PATH="$(pwd)/configs/config_100M.yaml"
+
+cd "$SRC_DIR" || exit 1
+# Source conda.sh itself: sourcing the profile.d directory fails and leaves `conda activate` unavailable.
+source /ccs/home/jonathanlimsc/miniconda3/etc/profile.d/conda.sh
+conda activate /ccs/home/jonathanlimsc/.conda/envs/bgpt/
+
+module load rocm/5.2
+
+# export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1`
+
+# mkdir "$SLURM_JOB_NAME"
+# mkdir "$SLURM_JOB_NAME"/checkpoints
+# mkdir "$SLURM_JOB_NAME"/dataloaders
+
+python train-gen.py --train-config-path "${CONFIG_PATH}"
diff --git a/bgpt/launch.sh b/bgpt/launch.sh
index 47d948a..28d5536 100644
--- a/bgpt/launch.sh
+++ b/bgpt/launch.sh
@@ -7,15 +7,14 @@
 #SBATCH -N 32
 
 # Define the source and target directories
-SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt"
+SRC_DIR="/lustre/orion/csc590/scratch/george-adams/byte_models/bgpt"
 
 # Change to config YAML path
-CONFIG_PATH="bgpt/configs/config_100M.yaml"
+CONFIG_PATH="/lustre/orion/csc590/scratch/george-adams/byte_models/bgpt/configs/config_110M.yaml"
 
 cd "$SRC_DIR"
 
-source /lustre/orion/csc590/scratch/$(whoami)/miniconda3/etc/profile.d/conda.sh
-conda activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt
+source activate /lustre/orion/csc590/scratch/george-adams/conda_envs/bgpt
 
 module load rocm/5.2
 
@@ -25,4 +24,4 @@ export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\
 # mkdir "$SLURM_JOB_NAME"/checkpoints
 # mkdir "$SLURM_JOB_NAME"/dataloaders
 
-srun torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH}
+srun torchrun --nnodes=32 --nproc_per_node=8 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH}
diff --git a/bgpt/launch_batch_jon.sh b/bgpt/launch_batch_jon.sh
new file mode 100755
index 0000000..a7abe69
--- /dev/null
+++ b/bgpt/launch_batch_jon.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH -A CSC590
+#SBATCH -J test-interactive
+#SBATCH -o /lustre/orion/csc590/scratch/jonathanlimsc/bgpt/logs/%x-%j.out
+#SBATCH -t 00:10:00
+#SBATCH -p batch
+#SBATCH -q debug
+#SBATCH -N 2
+# Batch job that launches bGPT training (config_1B_jon.yaml) through srun + torchrun.
+# Directory containing train-gen.py
+SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_1B_jon.yaml"
+
+# train-gen.py is given as a relative path below, so stop if the directory is unavailable.
+cd "$SRC_DIR" || exit 1
+source activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2
+module load rocm/5.2
+
+# First IPv4 address reported for hsn0; used as the torchrun rendezvous host.
+MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
+export MASTER_IP
+# mkdir "$SLURM_JOB_NAME"
+# mkdir "$SLURM_JOB_NAME"/checkpoints
+# mkdir "$SLURM_JOB_NAME"/dataloaders
+# NOTE(review): the job requests 2 nodes (-N 2) but torchrun is given --nnodes=1; confirm this is intended.
+srun torchrun --nnodes=1 --nproc_per_node=8 --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
diff --git a/bgpt/launch_batch_jon_nvidia.sh b/bgpt/launch_batch_jon_nvidia.sh
new file mode 100755
index 0000000..6418c22
--- /dev/null
+++ b/bgpt/launch_batch_jon_nvidia.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH -J test-batch
+#SBATCH -o /nfs/scratch/jonathan/byte_models/logs/%x-%j.out
+#SBATCH --error=/nfs/scratch/jonathan/byte_models/logs/%x-%j.err
+#SBATCH -N 1
+#SBATCH --cpus-per-task 6
+#SBATCH --gres=gpu:A100:2
+#SBATCH --mem=32G
+#SBATCH -t 00:5:00
+# Single-node batch job (2 A100 GPUs requested) that resumes bGPT training through srun + torchrun.
+# Directory containing train-gen.py
+SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_jon_nvidia.yaml"
+
+NUM_NODES=1
+NUM_GPUS_PER_NODE=2
+
+cd "$SRC_DIR" || exit 1  # train-gen.py is resolved relative to this directory
+
+source "${HOME}/.bashrc"
+source activate /nfs/scratch/jonathan/micromamba/envs/bgpt
+
+# export MASTER_IP=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+export MASTER_IP=localhost
+# export MASTER_PORT=$((((RANDOM<<15)|RANDOM)%63001+2001))
+# --load-from-checkpoint is passed unconditionally, so this job always resumes.
+srun torchrun --nnodes="${NUM_NODES}" --nproc_per_node="${NUM_GPUS_PER_NODE}" --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}" --load-from-checkpoint
diff --git a/bgpt/launch_global.sh b/bgpt/launch_global.sh
new file mode 100755
index 0000000..c423168
--- /dev/null
+++ b/bgpt/launch_global.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Submits NUM_JOBS chained Slurm jobs running conditional_launch.sh; each later job depends on the previous one.
+SRC_DIR="/lustre/orion/csc590/scratch/george-adams/bgpt2/byte_models/bgpt"
+
+# Number of jobs to submit
+NUM_JOBS=2
+NUM_NODES=32
+NUM_GPUS_PER_NODE=8
+TIME_LIMIT_PER_JOB="2:00:00"
+
+# Job dependency (set to "none" for the first job)
+DEPENDENCY="none"
+
+for (( i=1; i<=NUM_JOBS; i++ ))
+do
+  # Job name
+  JOB_NAME="110m-wikipedia-reloaded-$i"
+
+  # First job starts fresh; later jobs use afterany on the previous job and pass --load-from-checkpoint
+  if [[ "$DEPENDENCY" == "none" ]]; then
+    JOB_SUBMIT_OUTPUT=$(sbatch --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --error="${SRC_DIR}/logs/%x-%j.err" --time="${TIME_LIMIT_PER_JOB}" --nodes="${NUM_NODES}" --partition=batch conditional_launch.sh)
+  else
+    JOB_SUBMIT_OUTPUT=$(sbatch --dependency=afterany:"$DEPENDENCY" --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --error="${SRC_DIR}/logs/%x-%j.err" --time="${TIME_LIMIT_PER_JOB}" --nodes="${NUM_NODES}" --partition=batch conditional_launch.sh --load-from-checkpoint)
+  fi
+
+  # Extract job ID from the submission output; stop the chain if sbatch returned none
+  JOB_ID=$(echo "$JOB_SUBMIT_OUTPUT" | grep -oP '\d+')
+  if [[ -z "$JOB_ID" ]]; then echo "sbatch submission failed for $JOB_NAME" >&2; exit 1; fi
+  echo "Submitted job $JOB_ID"
+  # Set this job ID as the dependency for the next job
+  DEPENDENCY=$JOB_ID
+done
diff --git a/bgpt/launch_global_jon.sh b/bgpt/launch_global_jon.sh
new file mode 100755
index 0000000..3184637
--- /dev/null
+++ b/bgpt/launch_global_jon.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Submits NUM_JOBS chained Slurm jobs running conditional_launch_jon.sh; each later job depends on the previous one.
+SRC_DIR="/lustre/orion/csc590/scratch/jonathanlimsc/bgpt"
+
+# Number of jobs to submit
+NUM_JOBS=2
+NUM_NODES=2
+
+# Job dependency (set to "none" for the first job)
+DEPENDENCY="none"
+
+for (( i=1; i<=NUM_JOBS; i++ ))
+do
+  # Job name
+  JOB_NAME="test-continual-$i"
+
+  # First job starts fresh; later jobs use afterany on the previous job and pass --load-from-checkpoint
+  if [[ "$DEPENDENCY" == "none" ]]; then
+    JOB_SUBMIT_OUTPUT=$(sbatch -A csc590 --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --time=00:05:00 --partition=batch --nodes="${NUM_NODES}" conditional_launch_jon.sh)
+  else
+    JOB_SUBMIT_OUTPUT=$(sbatch -A csc590 --dependency=afterany:"$DEPENDENCY" --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --time=00:05:00 --partition=batch --nodes="${NUM_NODES}" conditional_launch_jon.sh --load-from-checkpoint)
+  fi
+
+  # Extract job ID from the submission output; stop the chain if sbatch returned none
+  JOB_ID=$(echo "$JOB_SUBMIT_OUTPUT" | grep -oP '\d+')
+  if [[ -z "$JOB_ID" ]]; then echo "sbatch submission failed for $JOB_NAME" >&2; exit 1; fi
+  echo "Submitted job $JOB_ID"
+  # Set this job ID as the dependency for the next job
+  DEPENDENCY=$JOB_ID
+done
diff --git a/bgpt/launch_global_nvidia.sh b/bgpt/launch_global_nvidia.sh
new file mode 100755
index 0000000..8045a7a
--- /dev/null
+++ b/bgpt/launch_global_nvidia.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Submits NUM_JOBS chained Slurm jobs running conditional_launch_nvidia.sh; each later job depends on the previous one.
+SRC_DIR="/nfs/scratch/jonathan/byte_models"
+
+# Number of jobs to submit
+NUM_JOBS=3
+NUM_NODES=1
+NUM_GPUS_PER_NODE=4
+CPUS_PER_TASK=6
+MEM="32G"
+TIME_LIMIT_PER_JOB="24:00:00"
+# Job dependency (set to "none" for the first job)
+DEPENDENCY="none"
+
+for (( i=1; i<=NUM_JOBS; i++ ))
+do
+  # Job name
+  JOB_NAME="continual-job-$i"
+
+  # First job starts fresh; later jobs use afterany on the previous job and pass --load-from-checkpoint
+  if [[ "$DEPENDENCY" == "none" ]]; then
+    JOB_SUBMIT_OUTPUT=$(sbatch --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --error="${SRC_DIR}/logs/%x-%j.err" --time="${TIME_LIMIT_PER_JOB}" --nodes="${NUM_NODES}" --gres=gpu:A100:"${NUM_GPUS_PER_NODE}" --cpus-per-task="${CPUS_PER_TASK}" --mem="${MEM}" conditional_launch_nvidia.sh)
+  else
+    JOB_SUBMIT_OUTPUT=$(sbatch --dependency=afterany:"$DEPENDENCY" --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --error="${SRC_DIR}/logs/%x-%j.err" --time="${TIME_LIMIT_PER_JOB}" --nodes="${NUM_NODES}" --gres=gpu:A100:"${NUM_GPUS_PER_NODE}" --cpus-per-task="${CPUS_PER_TASK}" --mem="${MEM}" conditional_launch_nvidia.sh --load-from-checkpoint)
+  fi
+
+  # Extract job ID from the submission output; stop the chain if sbatch returned none
+  JOB_ID=$(echo "$JOB_SUBMIT_OUTPUT" | grep -oP '\d+')
+  if [[ -z "$JOB_ID" ]]; then echo "sbatch submission failed for $JOB_NAME" >&2; exit 1; fi
+  echo "Submitted job $JOB_ID"
+  # Set this job ID as the dependency for the next job
+  DEPENDENCY=$JOB_ID
+done
diff --git a/bgpt/launch_global_nvidia_small.sh b/bgpt/launch_global_nvidia_small.sh
new file mode 100755
index 0000000..ca3052c
--- /dev/null
+++ b/bgpt/launch_global_nvidia_small.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Submits NUM_JOBS chained Slurm jobs running conditional_launch_nvidia_small.sh; each later job depends on the previous one.
+SRC_DIR="/nfs/scratch/jonathan/byte_models"
+
+# Number of jobs to submit
+NUM_JOBS=1
+NUM_NODES=1
+NUM_GPUS_PER_NODE=2
+CPUS_PER_TASK=6
+MEM="32G"
+TIME_LIMIT_PER_JOB="24:00:00"
+# Job dependency (set to "none" for the first job)
+DEPENDENCY="none"
+
+for (( i=1; i<=NUM_JOBS; i++ ))
+do
+  # Job name
+  JOB_NAME="test-eval-$i"
+
+  # First job starts fresh; later jobs use afterany on the previous job and pass --load-from-checkpoint
+  if [[ "$DEPENDENCY" == "none" ]]; then
+    JOB_SUBMIT_OUTPUT=$(sbatch --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --error="${SRC_DIR}/logs/%x-%j.err" --time="${TIME_LIMIT_PER_JOB}" --nodes="${NUM_NODES}" --gres=gpu:A100:"${NUM_GPUS_PER_NODE}" --cpus-per-task="${CPUS_PER_TASK}" --mem="${MEM}" conditional_launch_nvidia_small.sh)
+  else
+    JOB_SUBMIT_OUTPUT=$(sbatch --dependency=afterany:"$DEPENDENCY" --job-name="$JOB_NAME" --output="${SRC_DIR}/logs/%x-%j.out" --error="${SRC_DIR}/logs/%x-%j.err" --time="${TIME_LIMIT_PER_JOB}" --nodes="${NUM_NODES}" --gres=gpu:A100:"${NUM_GPUS_PER_NODE}" --cpus-per-task="${CPUS_PER_TASK}" --mem="${MEM}" conditional_launch_nvidia_small.sh --load-from-checkpoint)
+  fi
+
+  # Extract job ID from the submission output; stop the chain if sbatch returned none
+  JOB_ID=$(echo "$JOB_SUBMIT_OUTPUT" | grep -oP '\d+')
+  if [[ -z "$JOB_ID" ]]; then echo "sbatch submission failed for $JOB_NAME" >&2; exit 1; fi
+  echo "Submitted job $JOB_ID"
+  # Set this job ID as the dependency for the next job
+  DEPENDENCY=$JOB_ID
+done
diff --git a/bgpt/launch_interactive_jon.sh b/bgpt/launch_interactive_jon.sh
new file mode 100755
index 0000000..24f7a02
--- /dev/null
+++ b/bgpt/launch_interactive_jon.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Starts a single torchrun worker (--nnodes=1, --nproc_per_node=1) under srun.
+# Directory containing train-gen.py
+SRC_DIR="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/ccs/home/jonathanlimsc/generalist/byte_models/bgpt/configs/config_110M_jon.yaml"
+
+cd "$SRC_DIR" || exit 1  # train-gen.py is resolved relative to this directory
+
+source /ccs/home/jonathanlimsc/miniconda3/etc/profile.d/conda.sh
+conda activate /ccs/home/jonathanlimsc/.conda/envs/bgpt2
+
+# module load rocm/5.2
+
+MASTER_IP=$(ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1)
+export MASTER_IP  # first IPv4 address reported for hsn0; used as the rendezvous host
+# mkdir "$SLURM_JOB_NAME"
+# mkdir "$SLURM_JOB_NAME"/checkpoints
+# mkdir "$SLURM_JOB_NAME"/dataloaders
+
+srun torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id="$SLURM_JOB_ID" --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
diff --git a/bgpt/launch_interactive_jon_nvidia.sh b/bgpt/launch_interactive_jon_nvidia.sh
new file mode 100755
index 0000000..aaccebd
--- /dev/null
+++ b/bgpt/launch_interactive_jon_nvidia.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Starts a single torchrun worker on localhost with a fixed rendezvous id (no srun).
+# Directory containing train-gen.py
+SRC_DIR="/home/jonathan/cerc/byte_models/bgpt"
+
+# Change to config YAML path
+CONFIG_PATH="/home/jonathan/cerc/byte_models/bgpt/configs/config_110M_jon_nvidia.yaml"
+
+cd "$SRC_DIR" || exit 1  # train-gen.py is resolved relative to this directory
+
+source "${HOME}/.bashrc"
+source activate /nfs/scratch/jonathan/micromamba/envs/bgpt
+
+# module load rocm/5.2
+
+export MASTER_IP=localhost
+# export MASTER_IP=`ip -f inet addr show hsn0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p' | head -1`
+# export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+
+# torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_IP:29400 train-gen.py --train-config-path ${CONFIG_PATH}
+torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id=1337 --rdzv_backend=c10d --rdzv_endpoint="$MASTER_IP:29400" train-gen.py --train-config-path "${CONFIG_PATH}"
diff --git a/bgpt/reloading_sampler.py b/bgpt/reloading_sampler.py
new file mode 100644
index 0000000..9e416b7
--- /dev/null
+++ b/bgpt/reloading_sampler.py
@@ -0,0 +1,37 @@
+import math
+from typing import Optional, Iterator
+
+import torch
+from torch.utils.data import Dataset
+import torch.distributed as dist
+from torch.utils.data.distributed import DistributedSampler
+
+
+class CustomDistributedSampler(DistributedSampler):
+    """DistributedSampler that skips the first ``start_index`` entries of this rank's index list."""
+
+    def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None,
+                 rank: Optional[int] = None, shuffle: bool = True,
+                 seed: int = 0, drop_last: bool = False, start_index: int = 0) -> None:
+        """Forward the standard arguments to DistributedSampler and record the offset.
+
+        start_index: number of leading entries to drop from the index list the
+        parent sampler produces for this rank. Default is 0 (drop nothing).
+        """
+        super().__init__(
+            dataset, num_replicas=num_replicas, rank=rank,
+            shuffle=shuffle, seed=seed, drop_last=drop_last,
+        )
+        # Extra state layered on top of the parent sampler.
+        self.start_index = start_index
+
+    def __iter__(self) -> Iterator[int]:
+        """Iterate over the parent's indices with the first ``start_index`` removed."""
+        # Materialise the parent's iterator so the leading entries can be sliced off.
+        per_rank = list(super().__iter__())
+        remaining = per_rank[self.start_index:]
+        return iter(remaining)
+
+    def set_index(self, index: int):
+        """Replace the offset used by subsequent iterations."""
+        self.start_index = index
diff --git a/bgpt/requirements_nvidia.txt b/bgpt/requirements_nvidia.txt
new file mode 100644
index 0000000..495a37e
Binary files /dev/null and b/bgpt/requirements_nvidia.txt differ
diff --git a/bgpt/train-gen-og.py b/bgpt/train-gen-og.py
new file mode 100644
index 0000000..70801a5
--- /dev/null
+++ b/bgpt/train-gen-og.py
@@ -0,0 +1,421 @@
+import os
+import time
+import torch
+import random
+import wandb
+import numpy as np
+from utils import *
+from config_og import *
+from tqdm import tqdm
+from copy import deepcopy
+from torch.cuda.amp import autocast, GradScaler
+from torch.utils.data import Dataset, DataLoader
+from transformers import GPT2Config, get_scheduler
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data.distributed import DistributedSampler
+from datetime import datetime
+
+
+
+# Distributed context is read from the environment; each value falls back to
+# the single-process default when the variable is absent.
+world_size = int(os.environ.get('WORLD_SIZE', 1))
+global_rank = int(os.environ.get('RANK', 0))
+local_rank = int(os.environ.get('LOCAL_RANK', 0))
+
+if world_size <= 1:
+    # Single process: use CUDA when available, otherwise the CPU.
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+else:
+    # Multi-process: bind this process to the GPU matching its local rank
+    # and join the NCCL process group.
+    torch.cuda.set_device(local_rank)
+    device = torch.device("cuda", local_rank)
+    dist.init_process_group(backend='nccl')
+
+# Seed every RNG in use; the seed equals the global rank, so it differs per process.
+seed = global_rank
+random.seed(seed)
+np.random.seed(seed)
+torch.manual_seed(seed)
+torch.cuda.manual_seed_all(seed)
+# Ask cuDNN for deterministic kernels and disable its autotuner.
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+batch_size = BATCH_SIZE
+
+# Patch-level GPT-2 configuration.
+patch_config = GPT2Config(
+    num_hidden_layers=PATCH_NUM_LAYERS,
+    max_length=PATCH_LENGTH,
+    max_position_embeddings=PATCH_LENGTH,
+    hidden_size=HIDDEN_SIZE,
+    n_head=HIDDEN_SIZE//64,
+    vocab_size=1,
+)
+# Byte-level GPT-2 configuration; vocabulary is the 256 byte values plus one extra id.
+byte_config = GPT2Config(
+    num_hidden_layers=BYTE_NUM_LAYERS,
+    max_length=PATCH_SIZE+1,
+    max_position_embeddings=PATCH_SIZE+1,
+    hidden_size=HIDDEN_SIZE,
+    n_head=HIDDEN_SIZE//64,
+    vocab_size=256+1,
+)
+model = bGPTLMHeadModel(patch_config, byte_config)
+model = model.to(device)
+
+# Report how many parameters are trainable.
+trainable_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
+print("Parameter Number: "+str(trainable_param_count))
+
+# Wrap in DDP only when more than one process takes part.
+if world_size > 1:
+    model = DDP(model, device_ids=[local_rank], output_device=local_rank,  find_unused_parameters=True)
+
+# Mixed-precision gradient scaler and the optimizer over the model's parameters.
+scaler = GradScaler()
+is_autocast = True
+optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
+    
+def collate_batch(input_batches):
+    """Pad a list of (patches, masks) samples into two batch tensors on ``device``.
+
+    Patches are right-padded with 256 and masks with 0, both batch-first.
+    """
+    patch_seqs, mask_seqs = zip(*input_batches)
+    pad = torch.nn.utils.rnn.pad_sequence
+    padded_patches = pad(patch_seqs, batch_first=True, padding_value=256)
+    padded_masks = pad(mask_seqs, batch_first=True, padding_value=0)
+    return padded_patches.to(device), padded_masks.to(device)
+
+def split_into_minibatches(input_patches, input_masks, minibatch_size):
+    """Slice patches and masks into consecutive chunks of ``minibatch_size`` items.
+
+    Returns a list of (patches_chunk, masks_chunk) tuples; the last chunk may
+    be shorter when the length is not a multiple of ``minibatch_size``.
+    """
+    return [
+        (input_patches[lo:lo + minibatch_size], input_masks[lo:lo + minibatch_size])
+        for lo in range(0, len(input_patches), minibatch_size)
+    ]
+
+def list_files_in_directory(directories):
+    """Recursively collect the paths of all files under each of ``directories``.
+
+    Sub-directories and file names are visited in sorted order so the returned
+    list is the same on every call and every process; os.walk on its own
+    yields entries in whatever order the filesystem reports them.
+
+    Returns a list of paths built with os.path.join(root, name); an empty
+    ``directories`` gives an empty list.
+    """
+    file_list = []
+
+    for directory in directories:
+        for root, dirs, files in os.walk(directory):
+            # Sorting in place controls the order in which os.walk descends.
+            dirs.sort()
+            file_list.extend(os.path.join(root, name) for name in sorted(files))
+    return file_list
+
+def read_bytes(filename):
+    """Read a file and turn it into a padded byte-id sequence plus a per-patch mask.
+
+    Layout of the returned sequence (value 256 is used as filler throughout):
+      * one leading patch holding the UTF-8 bytes of the text after the last
+        '.' in ``filename`` (cut to PATCH_SIZE), filled up with 256;
+      * the file content, filled up with 256 to a multiple of PATCH_SIZE;
+      * one trailing patch made entirely of 256.
+    The sequence is cut to PATCH_LENGTH*PATCH_SIZE values when longer.
+
+    Returns (sequence, masks) where masks is a list of 1s, one per patch.
+    """
+    ext_ids = list(bytearray(filename.split('.')[-1], 'utf-8'))[:PATCH_SIZE]
+
+    with open(filename, 'rb') as handle:
+        content_ids = list(handle.read())
+
+    # Fill the content up to a whole number of patches.
+    leftover = len(content_ids) % PATCH_SIZE
+    if leftover != 0:
+        content_ids = content_ids + [256] * (PATCH_SIZE - leftover)
+
+    bos_patch = ext_ids + [256] * (PATCH_SIZE - len(ext_ids))
+    sequence = bos_patch + content_ids + [256] * PATCH_SIZE
+
+    if len(sequence) > PATCH_LENGTH*PATCH_SIZE:
+        print(f"Warning: {filename} is too long, truncating to {PATCH_LENGTH*PATCH_SIZE} bytes.")
+        sequence = sequence[:PATCH_LENGTH*PATCH_SIZE]
+
+    # The mask is computed after truncation, so it matches the final length.
+    masks = [1] * (len(sequence)//PATCH_SIZE)
+
+    return sequence, masks
+
+class ByteDataset(Dataset):
+    """Dataset of byte sequences read from files, driven by CONVERSION_MODE.
+
+    * CONVERSION_MODE is None: every entry is a single filename.
+    * "in->out": entries are (input, target) pairs for files ending in ``in``
+      whose counterpart ending in ``out`` exists on disk.
+    * "a&b": like the above, but files ending in ``b`` are also matched back
+      to their ``a`` counterpart.
+    """
+
+    def __init__(self, filenames):
+        mode = CONVERSION_MODE
+        if mode is None:
+            print(f"Regular Training Mode: {CONVERSION_MODE}, loading {len(filenames)} files")
+            self.filenames = filenames
+            return
+
+        if "->" in mode:
+            print(f"Unidirectional Conversion Mode: {CONVERSION_MODE}, loading {len(filenames)} files")
+            separator, both_directions = "->", False
+        elif "&" in mode:
+            print(f"Bidirectional Conversion Mode: {CONVERSION_MODE}, loading {len(filenames)} files")
+            separator, both_directions = "&", True
+        else:
+            raise ValueError("Invalid Conversion Mode, please check the config.py file")
+
+        parts = mode.split(separator)
+        input_ext, target_ext = parts[0], parts[1]
+
+        pairs = []
+        for filename in filenames:
+            ext = filename.split('.')[-1]
+            if ext == input_ext:
+                # Swap the trailing extension to find the target file.
+                target_filename = filename[:-(len(input_ext))] + target_ext
+                if os.path.exists(target_filename):
+                    pairs.append((filename, target_filename))
+            elif both_directions and ext == target_ext:
+                # Bidirectional only: start from the target and look up the input.
+                input_filename = filename[:-(len(target_ext))] + input_ext
+                if os.path.exists(input_filename):
+                    pairs.append((input_filename, filename))
+        self.filenames = pairs
+
+    def __len__(self):
+        return len(self.filenames)
+
+    def __getitem__(self, idx):
+        """Return (byte_ids, patch_masks) as two long tensors for entry ``idx``."""
+        if CONVERSION_MODE is None:
+            file_bytes, file_masks = read_bytes(self.filenames[idx])
+        else:
+            input_filename, target_filename = self.filenames[idx]
+            input_bytes, input_masks = read_bytes(input_filename)
+            target_bytes, target_masks = read_bytes(target_filename)
+
+            # Drop the input's last patch (and its mask entry) before appending the target.
+            file_bytes = input_bytes[:-PATCH_SIZE] + target_bytes
+            file_masks = input_masks[:-1] + target_masks
+
+            if len(file_bytes) > PATCH_LENGTH*PATCH_SIZE:
+                print(f"Warning: {input_filename} and {target_filename} are too long after concatenation, truncating to {PATCH_LENGTH*PATCH_SIZE} bytes.")
+                file_bytes = file_bytes[:PATCH_LENGTH*PATCH_SIZE]
+                file_masks = file_masks[:PATCH_LENGTH]
+
+        return (torch.tensor(file_bytes, dtype=torch.long),
+                torch.tensor(file_masks, dtype=torch.long))
+
+# call model with a batch of input
+def process_one_batch(batch):
+    """Run the module-level ``model`` on one (patches, masks) pair and return its loss.
+
+    With ``world_size > 1`` the loss is reduced onto rank 0, divided by
+    ``world_size`` and broadcast from rank 0, so every rank returns the value
+    rank 0 holds after the division.
+    """
+    input_patches, input_masks = batch
+    loss = model(input_patches, input_masks).loss
+
+    # Reduce the loss on GPU 0
+    if world_size > 1:
+        # Give the loss one dimension before handing it to the collectives.
+        loss = loss.unsqueeze(0)
+        # In-place reduce onto rank 0 (no op argument is passed, so the library default applies).
+        dist.reduce(loss, dst=0)
+        loss = loss / world_size
+        # Overwrite every rank's value with rank 0's.
+        dist.broadcast(loss, src=0)
+        # NOTE(review): callers run backward() on this tensor after it was
+        # modified in place by the collectives - confirm this is intended.
+
+    return loss
+
+# do one epoch for training
+def train_epoch(total_train_iters = 1, logging_frequency = 100):
+    """Train for one pass over the module-level ``train_set`` loader.
+
+    Uses the module-level ``model``, ``scaler``, ``optimizer`` and
+    ``lr_scheduler``. Each loader batch is split into minibatches of
+    BATCH_SIZE//ACCUMULATION_STEPS samples whose gradients are accumulated
+    before a single optimizer step.
+
+    total_train_iters: running iteration counter carried across epochs
+        (pass the value returned by the previous call; starts at 1).
+    logging_frequency: rank 0 logs to wandb every this many iterations.
+
+    Returns (average loss per iteration, updated total_train_iters). The
+    average is 0 when the loader yields no batches instead of raising
+    ZeroDivisionError.
+    """
+    tqdm_train_set = tqdm(train_set)
+    # epoch loss
+    total_train_loss = 0
+    iter_idx = 1
+    model.train()
+
+    for batch in tqdm_train_set:
+        minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS)
+        # loss summed over this batch's minibatches (each already divided by ACCUMULATION_STEPS)
+        minibatch_loss = 0
+        for minibatch in minibatches:
+            with autocast():
+                loss = process_one_batch(minibatch) / ACCUMULATION_STEPS
+            scaler.scale(loss).backward()
+            loss_train = loss.item()
+            minibatch_loss += loss_train
+            total_train_loss += loss_train
+
+        if global_rank == 0 and iter_idx % logging_frequency == 0:
+            wandb.log({
+                "train_loss": minibatch_loss,
+                "total_train_iters": total_train_iters,
+            }, step=total_train_iters)
+        scaler.step(optimizer)
+        scaler.update()
+
+        lr_scheduler.step()
+        model.zero_grad(set_to_none=True)
+        tqdm_train_set.set_postfix({str(global_rank)+'_train_loss': total_train_loss / iter_idx})
+
+        iter_idx += 1
+        total_train_iters +=1
+    # iter_idx - 1 is the number of batches seen; guard the empty-loader case.
+    return total_train_loss / max(iter_idx-1, 1), total_train_iters
+
+# do one epoch for eval
+def eval_epoch(total_val_iters = 1, logging_frequency = 100):
+    """Evaluate the module-level ``model`` for one pass over ``eval_set``.
+
+    total_val_iters: running evaluation-iteration counter carried across
+        epochs (pass the value returned by the previous call; starts at 1).
+    logging_frequency: rank 0 logs to wandb every this many iterations.
+
+    Returns (average loss per iteration, updated total_val_iters). The average
+    is 0 when the loader yields no batches instead of raising
+    ZeroDivisionError.
+
+    NOTE(review): eval metrics are logged with step=total_val_iters while
+    training logs use step=total_train_iters on the same wandb run - confirm
+    wandb accepts the eval steps once the training step counter is ahead.
+    """
+    tqdm_eval_set = tqdm(eval_set)
+    total_eval_loss = 0
+    iter_idx = 1
+    model.eval()
+
+    # Evaluate data for one epoch
+    for batch in tqdm_eval_set:
+        minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS)
+        # loss summed over this batch's minibatches (each already divided by ACCUMULATION_STEPS)
+        minibatch_loss = 0
+        for minibatch in minibatches:
+            with torch.no_grad():
+                loss = process_one_batch(minibatch) / ACCUMULATION_STEPS
+            loss_eval = loss.item()
+            total_eval_loss += loss_eval
+            minibatch_loss += loss_eval
+        if global_rank == 0 and iter_idx % logging_frequency == 0:
+            wandb.log({
+                "eval_loss": minibatch_loss,
+                "total_val_iters": total_val_iters,
+            }, step=total_val_iters)
+        tqdm_eval_set.set_postfix({str(global_rank)+'_eval_loss': total_eval_loss / iter_idx})
+        iter_idx += 1
+        total_val_iters += 1
+    # iter_idx - 1 is the number of batches seen; guard the empty-loader case.
+    return total_eval_loss / max(iter_idx-1, 1), total_val_iters
+
+# train and eval
+if __name__ == "__main__":
+    current_time = datetime.now()
+
+    # Timestamp string used to give the wandb run a unique name
+    run_name = current_time.strftime("%Y-%m-%d_%H-%M-%S")
+
+    # wandb is initialised on rank 0 only, so the config update (and every
+    # wandb.log below) must be restricted to rank 0 as well.
+    if global_rank == 0:
+        wandb.init(project="null", entity="mina-beiramy", name=f"bgpt_abc_midi_{run_name}")
+
+        wandb.config.update({
+            "TRAIN_FOLDERS": TRAIN_FOLDERS,
+            "EVAL_FOLDERS": EVAL_FOLDERS,
+            # "PRE_WEIGHTS_PATH": PRE_WEIGHTS_PATH,
+            "WEIGHTS_PATH": WEIGHTS_PATH,
+            "LOGS_PATH": LOGS_PATH,
+            "PATCH_SIZE": PATCH_SIZE,
+            "PATCH_LENGTH": PATCH_LENGTH,
+            "BYTE_NUM_LAYERS": BYTE_NUM_LAYERS,
+            "PATCH_NUM_LAYERS": PATCH_NUM_LAYERS,
+            "HIDDEN_SIZE": HIDDEN_SIZE,
+            "NUM_EPOCHS": NUM_EPOCHS,
+            "LEARNING_RATE": LEARNING_RATE,
+            "BATCH_SIZE": BATCH_SIZE,
+            "ACCUMULATION_STEPS": ACCUMULATION_STEPS,
+            "LOAD_FROM_CHECKPOINT": LOAD_FROM_CHECKPOINT,
+            # "LOAD_FROM_PRE_CHECKPOINT": LOAD_FROM_PRE_CHECKPOINT
+            # Add any other configurations you'd like to track
+        })
+
+    # load filenames under train and eval folder
+    # Sorted first so that every rank starts from the same file order.
+    train_files = sorted(list_files_in_directory(TRAIN_FOLDERS))
+    eval_files = sorted(list_files_in_directory(EVAL_FOLDERS))
+
+    train_batch_nums = int(len(train_files) / batch_size)
+    eval_batch_nums = int(len(eval_files) / batch_size)
+
+    # The module-level RNG is seeded with 0 + global_rank, so shuffling with it
+    # would give each rank a different file order (and a different truncation),
+    # while DistributedSampler splits work by index and needs all ranks to
+    # index the same list. Use a dedicated RNG with a rank-independent seed.
+    file_rng = random.Random(0)
+    file_rng.shuffle(train_files)
+    file_rng.shuffle(eval_files)
+
+    # Keep only whole batches
+    train_files = train_files[:train_batch_nums*batch_size]
+    eval_files = eval_files[:eval_batch_nums*batch_size]
+
+    train_set = ByteDataset(train_files)
+    eval_set = ByteDataset(eval_files)
+
+    # Shard by global rank: local_rank repeats on every node and would make
+    # processes on different nodes read the same shard.
+    train_sampler = DistributedSampler(train_set, num_replicas=world_size, rank=global_rank)
+    eval_sampler = DistributedSampler(eval_set, num_replicas=world_size, rank=global_rank)
+
+    train_set = DataLoader(train_set, batch_size=batch_size, collate_fn=collate_batch, sampler=train_sampler, shuffle = (train_sampler is None))
+    eval_set = DataLoader(eval_set, batch_size=batch_size, collate_fn=collate_batch, sampler=eval_sampler, shuffle = (eval_sampler is None))
+
+    # Build the optimizer BEFORE the scheduler so the scheduler drives the
+    # optimizer that train_epoch actually steps.
+    model = model.to(device)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
+
+    lr_scheduler = get_scheduler(
+        name="cosine",
+        optimizer=optimizer,
+        num_warmup_steps=NUM_EPOCHS * len(train_set) // 10,
+        num_training_steps=NUM_EPOCHS * len(train_set),
+    )
+
+    # Checkpoints hold the unwrapped model's weights (see the save below), so
+    # load into model.module whenever the model is wrapped. Testing for the
+    # attribute matches the save logic; the GPU count does not tell whether
+    # the model was wrapped.
+    unwrapped_model = model.module if hasattr(model, "module") else model
+
+    # Defaults for a fresh run; overwritten when resuming from WEIGHTS_PATH.
+    pre_epoch = 0
+    best_epoch = 0
+    min_eval_loss = float('inf')
+
+    if LOAD_FROM_PRETRAINED and os.path.exists(PRETRAINED_PATH):
+        # Load checkpoint to CPU; load_state_dict copies the values into the model's parameters
+        checkpoint = torch.load(PRETRAINED_PATH, map_location='cpu')
+        unwrapped_model.load_state_dict(checkpoint['model'])
+        print(f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}")
+        checkpoint = None
+
+    if LOAD_FROM_CHECKPOINT and os.path.exists(WEIGHTS_PATH):
+        # Load checkpoint to CPU
+        checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu')
+        unwrapped_model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        lr_scheduler.load_state_dict(checkpoint['lr_sched'])
+        pre_epoch = checkpoint['epoch']
+        best_epoch = checkpoint['best_epoch']
+        min_eval_loss = checkpoint['min_eval_loss']
+        print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch)
+        checkpoint = None
+
+    total_train_iters = 1
+    total_val_iters = 1
+
+    for epoch in range(1+pre_epoch, NUM_EPOCHS+1):
+        train_sampler.set_epoch(epoch)
+        eval_sampler.set_epoch(epoch)
+        print('-' * 21 + "Epoch " + str(epoch) + '-' * 21)
+        avg_train_loss, total_train_iters = train_epoch(total_train_iters, 1)
+        eval_loss, total_val_iters = eval_epoch(total_val_iters, 1)
+        print(total_val_iters)
+
+        # Only rank 0 writes the shared log file and talks to wandb.
+        if global_rank == 0:
+            with open(LOGS_PATH,'a') as f:
+                f.write("Epoch " + str(epoch) + "\ntrain_loss: " + str(avg_train_loss) + "\neval_loss: " +str(eval_loss) + "\ntime: " + time.asctime(time.localtime(time.time())) + "\n\n")
+            wandb.log({
+                    "epoch_avg_train_loss": avg_train_loss,
+                    "avg_eval_loss": eval_loss,
+                    "epoch": epoch,
+                    "total_iters": total_train_iters,
+                }, step=total_train_iters)
+
+        # process_one_batch broadcasts rank 0's loss, so every rank takes the
+        # same branch here; only rank 0 writes the file to avoid concurrent
+        # writes to the same path.
+        if eval_loss < min_eval_loss:
+            best_epoch = epoch
+            min_eval_loss = eval_loss
+            if global_rank == 0:
+                checkpoint = {
+                                'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
+                                'optimizer': optimizer.state_dict(),
+                                'lr_sched': lr_scheduler.state_dict(),
+                                'epoch': epoch,
+                                'best_epoch': best_epoch,
+                                'min_eval_loss': min_eval_loss
+                                }
+                torch.save(checkpoint, WEIGHTS_PATH)
+
+        # Keep all ranks in step before the next epoch starts.
+        if world_size > 1:
+            dist.barrier()
+
+    if global_rank==0:
+        print("Best Eval Epoch : "+str(best_epoch))
+        print("Min Eval Loss : "+str(min_eval_loss))
diff --git a/bgpt/train-gen.py b/bgpt/train-gen.py
index 38e995c..79603f2 100644
--- a/bgpt/train-gen.py
+++ b/bgpt/train-gen.py
@@ -5,11 +5,13 @@
 import random
 import numpy as np
 import yaml
+import json
 import argparse
 
 from utils import *
 # from config import *
 from pathlib import Path
+from datetime import datetime
 from tqdm import tqdm
 from copy import deepcopy
 from torch.cuda.amp import autocast, GradScaler
@@ -17,6 +19,7 @@
 import torch.distributed as dist
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.data.distributed import DistributedSampler
+from reloading_sampler import CustomDistributedSampler
 
 import wandb
 
@@ -39,7 +42,7 @@
 
 
 # Set random seed
-seed = 0 + global_rank
+seed = 0
 random.seed(seed)
 np.random.seed(seed)
 torch.manual_seed(seed)
@@ -47,28 +50,38 @@
 torch.backends.cudnn.deterministic = True
 torch.backends.cudnn.benchmark = False
 
+
 def find_most_recent_file(directory, pattern="*.pth"):
     file_paths = glob.glob(os.path.join(directory, pattern))
     if file_paths:
         return max(file_paths, key=os.path.getmtime)
     return None
 
+def find_most_recent_file_from_checkpoint_info(checkpoint_info_filepath):
+    """Return the "latest_checkpoint_path" entry of a checkpoint-info JSON file.
+
+    Returns None when the file does not exist or has no such key, in the same
+    way find_most_recent_file returns None when nothing matches.
+    """
+    try:
+        with open(checkpoint_info_filepath, "r") as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        # No checkpoint has been recorded yet; say so instead of raising.
+        print(f"Checkpoint info file not found: {checkpoint_info_filepath}")
+        return None
+
+    return data.get("latest_checkpoint_path", None)
 
 def collate_batch(input_batches):
-    input_patches, input_masks = zip(*input_batches)
+    input_patches, input_masks, input_file_indices = zip(*input_batches)
     input_patches = torch.nn.utils.rnn.pad_sequence(input_patches, batch_first=True, padding_value=256)
     input_masks = torch.nn.utils.rnn.pad_sequence(input_masks, batch_first=True, padding_value=0)
+    input_file_indices = torch.nn.utils.rnn.pad_sequence(input_file_indices, batch_first=True, padding_value=-1)
 
-    return input_patches.to(device), input_masks.to(device)
+    return input_patches.to(device), input_masks.to(device), input_file_indices.to(device)
 
 
-def split_into_minibatches(input_patches, input_masks, minibatch_size):
+def split_into_minibatches(input_patches, input_masks, input_file_indices, minibatch_size):
     minibatches = []
     for start_idx in range(0, len(input_patches), minibatch_size):
         end_idx = start_idx + minibatch_size
         minibatch_patches = input_patches[start_idx:end_idx]
         minibatch_masks = input_masks[start_idx:end_idx]
-        minibatches.append((minibatch_patches, minibatch_masks))
+        minibatch_file_indices = input_file_indices[start_idx:end_idx]
+        minibatches.append((minibatch_patches, minibatch_masks, minibatch_file_indices))
     return minibatches
 
 
@@ -83,8 +96,8 @@ def list_files_in_directory(directories):
     return file_list
 
 
-def read_bytes(filename, 
-               patch_size, 
+def read_bytes(filename,
+               patch_size,
                patch_length):
     ext = filename.split('.')[-1]
     ext = bytearray(ext, 'utf-8')
@@ -123,16 +136,20 @@ def __getitem__(self, idx):
 
         file_bytes = torch.tensor(file_bytes, dtype=torch.long)
         file_masks = torch.tensor(file_masks, dtype=torch.long)
+        file_idx = torch.tensor([idx], dtype=torch.long)
 
-        return file_bytes, file_masks
+        return file_bytes, file_masks, file_idx
 
 
 # call model with a batch of input
-def process_one_batch(batch, 
-                      model, 
-                      world_size
+def process_one_batch(batch,
+                      model,
+                      verbose=False
                       ):
-    input_patches, input_masks = batch
+    input_patches, input_masks, input_file_indices = batch
+    if verbose:
+        print(f"Global Rank {global_rank}/{world_size} - File indices in batch: {input_file_indices}")
+
     loss = model(input_patches, input_masks).loss
 
     # Reduce the loss on GPU 0
@@ -147,32 +164,43 @@ def process_one_batch(batch,
 
 # do one epoch for training
 def train_epoch(model,
-                train_set, 
-                lr_scheduler, 
+                train_set,
+                eval_set,
+                lr_scheduler,
                 scaler,
                 optimizer,
-                epoch, 
-                best_epoch, 
-                min_eval_loss, 
-                batch_size, 
-                accumulation_steps, 
-                checkpoint_frequency, 
-                checkpoint_path):
-    
+                epoch,
+                best_epoch,
+                min_eval_loss,
+                batch_size,
+                accumulation_steps,
+                checkpoint_frequency,
+                checkpoint_path,
+                logging_frequency,
+                total_iters=1,
+                verbose=False,
+                ):
+    global_batch_size = batch_size * world_size
+    # Note: Size of train_set is equal to the number of global batches
+    iters_per_epoch = len(train_set)
+
     tqdm_train_set = tqdm(train_set)
     total_train_loss = 0
     iter_idx = 1
-    checkpoint_iters = 0
-    total_iters = 0
+    
     model.train()
 
     for batch in tqdm_train_set:
-        minibatches = split_into_minibatches(batch[0], batch[1], batch_size // accumulation_steps)
+        minibatches = split_into_minibatches(batch[0], batch[1], batch[2], batch_size // accumulation_steps)
+        minibatch_loss = 0
         for minibatch in minibatches:
             with autocast():
-                loss = process_one_batch(minibatch, model, world_size) / accumulation_steps
+                loss = process_one_batch(minibatch, model, verbose) / accumulation_steps
             scaler.scale(loss).backward()
-            total_train_loss += loss.item()
+            loss_val = loss.item()
+            minibatch_loss += loss_val
+            total_train_loss += loss_val
+
         scaler.step(optimizer)
         scaler.update()
 
@@ -180,60 +208,116 @@ def train_epoch(model,
         model.zero_grad(set_to_none=True)
         tqdm_train_set.set_postfix({str(global_rank) + '_train_loss': total_train_loss / iter_idx})
 
-        if iter_idx == checkpoint_frequency:
-            checkpoint = {
-                'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
-                'optimizer': optimizer.state_dict(),
-                'lr_sched': lr_scheduler.state_dict(),
-                'epoch': epoch,
-                'best_epoch': best_epoch,
-                'min_eval_loss': min_eval_loss
-            }
-
-        if checkpoint_iters == checkpoint_frequency:
-            torch.save(checkpoint, f'{checkpoint_path}/checkpoint{total_iters}.pth')
-            # torch.save(dataloader.state_dict(), f'{DATALOADER_PATH}/dataloader{total_iters}.pth') # THIS doesn't work
-            checkpoint_iters = 0
-
-        checkpoint_iters += 1
+        # print(checkpoint_iters, checkpoint_frequency)
+        # print(total_iters)
+
+        # Do eval
+        if iter_idx % checkpoint_frequency == 0:
+            eval_loss = eval_epoch(model,
+                                    eval_set,
+                                    batch_size,
+                                    accumulation_steps
+                                    )
+            
+
+        if global_rank == 0:
+            if iter_idx % logging_frequency == 0:
+                wandb.log({
+                    "train_loss": minibatch_loss,
+                    "total_iters": total_iters,
+                }, step=total_iters)
+
+            if iter_idx % checkpoint_frequency == 0:
+                # Log the latest loss for this checkpoint
+                wandb.log({
+                    "train_loss": minibatch_loss,
+                    "total_iters": total_iters,
+                    "ave_eval_loss": eval_loss if eval_loss else 0,
+                }, step=total_iters)
+
+                # For sampler to start from this start_index when resuming checkpoint
+                train_sampler_start_index = (total_iters % iters_per_epoch)*global_batch_size
+
+                checkpoint = {
+                    'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
+                    'optimizer': optimizer.state_dict(),
+                    'lr_sched': lr_scheduler.state_dict(),
+                    'epoch': epoch,
+                    'best_epoch': best_epoch,
+                    'min_eval_loss': min_eval_loss,
+                    'total_iters': total_iters,
+                    'train_sampler_start_index': train_sampler_start_index,
+                    'train_loss': minibatch_loss,
+                    'ave_eval_loss': eval_loss if eval_loss else 0,
+                }
+                checkpoint_filepath = f'{checkpoint_path}/checkpoint{total_iters}.pth'
+                torch.save(checkpoint, f'{checkpoint_path}/checkpoint{total_iters}.pth')
+                torch.save(checkpoint, f'{checkpoint_path}/latest.pth')
+
+                # Save latest_checkpoint_info.json about the latest checkpoint info and path.
+                with open(f'{checkpoint_path}/latest_checkpoint_info.json', "w") as f:
+                    checkpoint_data = {
+                        "latest_checkpoint_path": checkpoint_filepath,
+                        "epoch": epoch,
+                        "best_epoch": best_epoch,
+                        "total_iters": total_iters,
+                        "train_sampler_start_index": train_sampler_start_index,
+                        "train_loss": minibatch_loss,
+                        "ave_eval_loss": eval_loss if eval_loss else 0,
+                    }
+                    json.dump(checkpoint_data, f)
+
+                print(f"Checkpoint saved at {checkpoint_filepath}")
+
         total_iters += 1
         iter_idx += 1
 
-    return total_train_loss / (iter_idx - 1)
+    return total_train_loss / max((iter_idx - 1), 1), total_iters
 
 
 # do one epoch for eval
 def eval_epoch(model,
-               eval_set, 
-               batch_size, 
-               accumulation_steps
-               ):
-    tqdm_eval_set = tqdm(eval_set)
+                eval_set,
+                batch_size,
+                accumulation_steps
+            ):
+    
     total_eval_loss = 0
     iter_idx = 1
-    model.eval()
 
-    # Evaluate data for one epoch
-    for batch in tqdm_eval_set:
-        minibatches = split_into_minibatches(batch[0], batch[1], batch_size // accumulation_steps)
-        for minibatch in minibatches:
-            with torch.no_grad():
-                loss = process_one_batch(minibatch) / accumulation_steps
-            total_eval_loss += loss.item()
-        tqdm_eval_set.set_postfix({str(global_rank) + '_eval_loss': total_eval_loss / iter_idx})
-        iter_idx += 1
-    # return total_eval_loss / (iter_idx-1)
+    if len(eval_set) > 0:
+        model.eval()
+        tqdm_eval_set = tqdm(eval_set)
+        # Evaluate data for one epoch
+        for batch in tqdm_eval_set:
+            minibatches = split_into_minibatches(batch[0], batch[1], batch[2], batch_size // accumulation_steps)
+            for minibatch in minibatches:
+                with torch.no_grad():
+                    loss = process_one_batch(batch=minibatch, model=model) / accumulation_steps
+                total_eval_loss += loss.item()
+            tqdm_eval_set.set_postfix({str(global_rank) + '_eval_loss': total_eval_loss / iter_idx})
+            iter_idx += 1
+
+        model.train()
+
+    return total_eval_loss / max(iter_idx-1, 1)
+    # return 0
 
-    return 0
 
 def read_config_from_yaml(yaml_file):
     with open(yaml_file, 'r') as file:
         config = yaml.safe_load(file)
     return config
 
+
 def main(args):
     config = read_config_from_yaml(args.train_config_path)
+
+    LOAD_FROM_CHECKPOINT = args.load_from_checkpoint
+    config['load_from_checkpoint'] = LOAD_FROM_CHECKPOINT
+    
     print(config)
+
     TRAIN_FOLDERS = config.get("train_folders")
     EVAL_FOLDERS = config.get("eval_folders")
 
@@ -256,56 +340,65 @@ def main(args):
     BATCH_SIZE = config.get("batch_size")
     ACCUMULATION_STEPS = config.get("accumulation_steps")
     PATCH_SAMPLING_BATCH_SIZE = config.get("patch_sampling_batch_size")
-    LOAD_FROM_CHECKPOINT = config.get("load_from_checkpoint")
+    
     LOAD_FROM_PRE_CHECKPOINT = config.get("load_from_pre_checkpoint")
     CHECKPOINT_FREQUENCY = config.get("checkpoint_frequency")
+    LOGGING_FREQUENCY = config.get("logging_frequency")
+    VERBOSE = config.get("verbose")
     WANDB_CONFIG = config.get("wandb")
     WANDB_PROJ_NAME = WANDB_CONFIG.get("proj_name")
     WANDB_ENTITY = WANDB_CONFIG.get("entity")
     WANDB_MODE = WANDB_CONFIG.get("mode")
+    WANDB_NAME = WANDB_CONFIG.get("name", "run")
 
     FIRST_LAUNCH = config.get("first_launch")
 
     Path(BASE_DIR).mkdir(parents=True, exist_ok=True)
     Path(CHECKPOINT_PATH).mkdir(parents=True, exist_ok=True)
     Path(DATALOADER_PATH).mkdir(parents=True, exist_ok=True)
-
-    wandb.init(project=WANDB_PROJ_NAME, entity=WANDB_ENTITY, mode=WANDB_MODE)
-
-    wandb.config.update({
-        "TRAIN_FOLDERS": TRAIN_FOLDERS,
-        "EVAL_FOLDERS": EVAL_FOLDERS,
-        # "PRE_WEIGHTS_PATH": PRE_WEIGHTS_PATH,
-        "WEIGHTS_PATH": WEIGHTS_PATH,
-        "LOGS_PATH": LOGS_PATH,
-        "PATCH_SIZE": PATCH_SIZE,
-        "PATCH_LENGTH": PATCH_LENGTH,
-        "BYTE_NUM_LAYERS": BYTE_NUM_LAYERS,
-        "PATCH_NUM_LAYERS": PATCH_NUM_LAYERS,
-        "HIDDEN_SIZE": HIDDEN_SIZE,
-        "NUM_EPOCHS": NUM_EPOCHS,
-        "LEARNING_RATE": LEARNING_RATE,
-        "BATCH_SIZE": BATCH_SIZE,
-        "ACCUMULATION_STEPS": ACCUMULATION_STEPS,
-        "LOAD_FROM_CHECKPOINT": LOAD_FROM_CHECKPOINT,
-        "LOAD_FROM_PRE_CHECKPOINT": LOAD_FROM_PRE_CHECKPOINT
-        # Add any other configurations you'd like to track
-    })
+    
+    # Only log on master process
+    if global_rank == 0:
+        wandb.init(project=WANDB_PROJ_NAME, 
+                   entity=WANDB_ENTITY, 
+                   mode=WANDB_MODE,
+                   dir=BASE_DIR,
+                   name=WANDB_NAME + f"_{datetime.now().strftime('%Y%m%d_%H%M_%S')}")
+
+        wandb.config.update({
+            "TRAIN_FOLDERS": TRAIN_FOLDERS,
+            "EVAL_FOLDERS": EVAL_FOLDERS,
+            # "PRE_WEIGHTS_PATH": PRE_WEIGHTS_PATH,
+            "WEIGHTS_PATH": WEIGHTS_PATH,
+            "LOGS_PATH": LOGS_PATH,
+            "PATCH_SIZE": PATCH_SIZE,
+            "PATCH_LENGTH": PATCH_LENGTH,
+            "BYTE_NUM_LAYERS": BYTE_NUM_LAYERS,
+            "PATCH_NUM_LAYERS": PATCH_NUM_LAYERS,
+            "HIDDEN_SIZE": HIDDEN_SIZE,
+            "NUM_EPOCHS": NUM_EPOCHS,
+            "LEARNING_RATE": LEARNING_RATE,
+            "BATCH_SIZE": BATCH_SIZE,
+            "ACCUMULATION_STEPS": ACCUMULATION_STEPS,
+            "LOAD_FROM_CHECKPOINT": LOAD_FROM_CHECKPOINT,
+            "LOAD_FROM_PRE_CHECKPOINT": LOAD_FROM_PRE_CHECKPOINT
+            # Add any other configurations you'd like to track
+        })
 
     batch_size = BATCH_SIZE
 
     patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
-                            max_length=PATCH_LENGTH,
-                            max_position_embeddings=PATCH_LENGTH,
-                            hidden_size=HIDDEN_SIZE,
-                            n_head=HIDDEN_SIZE // 64,
-                            vocab_size=1)
+                              max_length=PATCH_LENGTH,
+                              max_position_embeddings=PATCH_LENGTH,
+                              hidden_size=HIDDEN_SIZE,
+                              n_head=HIDDEN_SIZE // 64,
+                              vocab_size=1)
     byte_config = GPT2Config(num_hidden_layers=BYTE_NUM_LAYERS,
-                            max_length=PATCH_SIZE + 1,
-                            max_position_embeddings=PATCH_SIZE + 1,
-                            hidden_size=HIDDEN_SIZE,
-                            n_head=HIDDEN_SIZE // 64,
-                            vocab_size=256 + 1)
+                             max_length=PATCH_SIZE + 1,
+                             max_position_embeddings=PATCH_SIZE + 1,
+                             hidden_size=HIDDEN_SIZE,
+                             n_head=HIDDEN_SIZE // 64,
+                             vocab_size=256 + 1)
     model = bGPTLMHeadModel(patch_config, byte_config)
     model = model.to(device)
 
@@ -317,8 +410,60 @@ def main(args):
 
     scaler = GradScaler()
     is_autocast = True
+
+    model = model.to(device)
+
+    is_checkpoint_loaded = False
+    checkpoint = None
+    train_sampler_start_index = 0
     optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
 
+    if LOAD_FROM_CHECKPOINT:
+        # Load checkpoint to CPU
+        # most_recent_checkpoint = find_most_recent_file(CHECKPOINT_PATH, pattern="checkpoint*.pth")
+        most_recent_checkpoint = find_most_recent_file_from_checkpoint_info(f"{CHECKPOINT_PATH}/latest_checkpoint_info.json")
+        if most_recent_checkpoint is not None:
+            WEIGHTS_PATH = most_recent_checkpoint
+            checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu')
+
+        # Here, model is assumed to be on GPU
+        # Load state dict to CPU model first, then move the model to GPU
+        if torch.cuda.device_count() > 1:
+            # If you have a DataParallel model, you need to load to model.module instead
+            cpu_model = deepcopy(model.module)
+            cpu_model.load_state_dict(checkpoint['model'])
+            model.module.load_state_dict(cpu_model.state_dict())
+        else:
+            # Load to a CPU clone of the model, then load back
+            cpu_model = deepcopy(model)
+            cpu_model.load_state_dict(checkpoint['model'])
+            model.load_state_dict(cpu_model.state_dict())
+
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        # Note: lr_scheduler is loaded from checkpoint later after training dataloader is defined.
+        # lr_scheduler.load_state_dict(checkpoint['lr_sched'])
+        pre_epoch = checkpoint['epoch']
+        best_epoch = checkpoint['best_epoch']
+        min_eval_loss = checkpoint['min_eval_loss']
+        total_iters = checkpoint['total_iters']
+        train_sampler_start_index = checkpoint['train_sampler_start_index']
+        checkpoint_train_loss = checkpoint['train_loss']
+        print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch)
+        is_checkpoint_loaded = True
+
+        # Log checkpoint's train_loss and total_iters for sanity checking on wandb
+        if global_rank == 0:
+            wandb.log({
+                        "train_loss": checkpoint_train_loss,
+                        "total_iters": total_iters,
+                        }, step=total_iters)
+
+    else:
+        pre_epoch = 1
+        best_epoch = 1
+        min_eval_loss = 100
+        total_iters = 1
+
     # load filenames under train and eval folder
     train_files = list_files_in_directory(TRAIN_FOLDERS)
     eval_files = list_files_in_directory(EVAL_FOLDERS)
@@ -337,25 +482,10 @@ def main(args):
     eval_dataset = ByteDataset(eval_files, PATCH_SIZE, PATCH_LENGTH)
 
     # Initialize DistributedSampler
-    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=local_rank)
-    eval_sampler = DistributedSampler(eval_dataset, num_replicas=world_size, rank=local_rank)
-
-    # Load the most recent DataLoader state (if applicable)
-    # if args.first_launch != "True":
-    if not FIRST_LAUNCH:
-        # For train_set, if you're saving states of the sampler, load it here
-        most_recent_train_dataloader_state = find_most_recent_file(DATALOADER_PATH, pattern="train_dataloader_*.pth")
-        if most_recent_train_dataloader_state:
-            # Assuming you have a mechanism to load the sampler's state
-            train_sampler_state = torch.load(most_recent_train_dataloader_state, map_location='cpu')
-            train_sampler.load_state_dict(train_sampler_state)
-            print(f"Loaded train sampler state from {most_recent_train_dataloader_state}")
-
-        # Similar logic could apply to eval_set if you have a stateful eval_sampler
-        most_recent_eval_dataloader_state = find_most_recent_file(DATALOADER_PATH, pattern="eval_dataloader_*.pth")
-        if most_recent_eval_dataloader_state:
-            # Load eval_sampler state if necessary
-            pass
+    train_sampler = CustomDistributedSampler(train_dataset, num_replicas=world_size, rank=global_rank,
+                                             start_index=train_sampler_start_index)
+    
+    eval_sampler = CustomDistributedSampler(eval_dataset, num_replicas=world_size, rank=global_rank)
 
     # Initialize DataLoaders with potentially state-restored samplers
     train_set = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_batch, sampler=train_sampler,
@@ -363,120 +493,104 @@ def main(args):
     eval_set = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collate_batch, sampler=eval_sampler,
                           shuffle=False)
 
+    print(train_set)
+    # print(train_set.state_dict())
+    # print(train_sampler.state_dict())
+    # print(dir(train_sampler))
+    # print(dir(train_set))
+
     lr_scheduler = get_scheduler(
         name="cosine",
         optimizer=optimizer,
         num_warmup_steps=NUM_EPOCHS * len(train_set) // 10,
         num_training_steps=NUM_EPOCHS * len(train_set),
     )
-    model = model.to(device)
-    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
-
-    if LOAD_FROM_PRE_CHECKPOINT and os.path.exists(PRE_WEIGHTS_PATH):
-        # Load checkpoint to CPU
-        checkpoint = torch.load(PRE_WEIGHTS_PATH, map_location='cpu')
-
-        # Here, model is assumed to be on GPU
-        # Load state dict to CPU model first, then move the model to GPU
-        if torch.cuda.device_count() > 1:
-            # If you have a DataParallel model, you need to load to model.module instead
-            cpu_model = deepcopy(model.module)
-            cpu_model.load_state_dict(checkpoint['model'])
-            model.module.load_state_dict(cpu_model.state_dict())
-        else:
-            # Load to a CPU clone of the model, then load back
-            cpu_model = deepcopy(model)
-            cpu_model.load_state_dict(checkpoint['model'])
-            model.load_state_dict(cpu_model.state_dict())
-
-        print(
-            f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}")
-
-    else:
-        pre_epoch = 0
-        best_epoch = 0
-        min_eval_loss = 100
-
-    if LOAD_FROM_CHECKPOINT and os.path.exists(WEIGHTS_PATH):
-        # Load checkpoint to CPU
-        # TODO: Missing function. Is this supposed to be find_most_recent_file?
-        # most_recent_checkpoint = find_most_recent_checkpoint(CHECKPOINT_PATH)
-        most_recent_checkpoint = find_most_recent_file(CHECKPOINT_PATH, pattern="checkpoint*.pth")
-        if most_recent_checkpoint is not None:
-            WEIGHTS_PATH = most_recent_checkpoint
-            checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu')
-
-        # Here, model is assumed to be on GPU
-        # Load state dict to CPU model first, then move the model to GPU
-        if torch.cuda.device_count() > 1:
-            # If you have a DataParallel model, you need to load to model.module instead
-            cpu_model = deepcopy(model.module)
-            cpu_model.load_state_dict(checkpoint['model'])
-            model.module.load_state_dict(cpu_model.state_dict())
-        else:
-            # Load to a CPU clone of the model, then load back
-            cpu_model = deepcopy(model)
-            cpu_model.load_state_dict(checkpoint['model'])
-            model.load_state_dict(cpu_model.state_dict())
-        optimizer.load_state_dict(checkpoint['optimizer'])
+    
+    if is_checkpoint_loaded and checkpoint is not None:
         lr_scheduler.load_state_dict(checkpoint['lr_sched'])
-        pre_epoch = checkpoint['epoch']
-        best_epoch = checkpoint['best_epoch']
-        min_eval_loss = checkpoint['min_eval_loss']
-        print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch)
-        checkpoint = None
 
-    else:
-        pre_epoch = 0
-        best_epoch = 0
-        min_eval_loss = 100
+    # Note: Code that uses pre-checkpoint, which we aren't using now
+    # if LOAD_FROM_PRE_CHECKPOINT and os.path.exists(PRE_WEIGHTS_PATH):
+    #     # Load checkpoint to CPU
+    #     checkpoint = torch.load(PRE_WEIGHTS_PATH, map_location='cpu')
+
+    #     # Here, model is assumed to be on GPU
+    #     # Load state dict to CPU model first, then move the model to GPU
+    #     if torch.cuda.device_count() > 1:
+    #         # If you have a DataParallel model, you need to load to model.module instead
+    #         cpu_model = deepcopy(model.module)
+    #         cpu_model.load_state_dict(checkpoint['model'])
+    #         model.module.load_state_dict(cpu_model.state_dict())
+    #     else:
+    #         # Load to a CPU clone of the model, then load back
+    #         cpu_model = deepcopy(model)
+    #         cpu_model.load_state_dict(checkpoint['model'])
+    #         model.load_state_dict(cpu_model.state_dict())
+
+    #     print(
+    #         f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}")
+
+    # else:
+    #     pre_epoch = 1
+    #     best_epoch = 1
+    #     min_eval_loss = 100
 
     for epoch in range(1 + pre_epoch, NUM_EPOCHS + 1):
         train_sampler.set_epoch(epoch)
         eval_sampler.set_epoch(epoch)
         print('-' * 21 + "Epoch " + str(epoch) + '-' * 21)
 
-        train_loss = train_epoch(model, 
-                                 train_set, 
-                                 lr_scheduler, 
-                                 scaler,
-                                 optimizer,
-                                 epoch, 
-                                 best_epoch, 
-                                 min_eval_loss, 
-                                 BATCH_SIZE, 
-                                 ACCUMULATION_STEPS, 
-                                 CHECKPOINT_FREQUENCY, 
-                                 CHECKPOINT_PATH
-                                 )
-        
-        eval_loss = eval_epoch(model,
-                               eval_set, 
-                               BATCH_SIZE, 
-                               ACCUMULATION_STEPS
-                               )
+        ave_train_loss, total_iters = train_epoch(model,
+                                              train_set,
+                                              eval_set,
+                                              lr_scheduler,
+                                              scaler,
+                                              optimizer,
+                                              epoch,
+                                              best_epoch,
+                                              min_eval_loss,
+                                              BATCH_SIZE,
+                                              ACCUMULATION_STEPS,
+                                              CHECKPOINT_FREQUENCY,
+                                              CHECKPOINT_PATH,
+                                              LOGGING_FREQUENCY,
+                                              total_iters,
+                                              VERBOSE
+                                              )
+        if len(eval_set) != 0:
+            eval_loss = eval_epoch(model,
+                                   eval_set,
+                                   BATCH_SIZE,
+                                   ACCUMULATION_STEPS
+                                   )
+        else:
+            eval_loss = 0
 
         if global_rank == 0:
             with open(LOGS_PATH, 'a') as f:
-                f.write("Epoch " + str(epoch) + "\ntrain_loss: " + str(train_loss) + "\neval_loss: " + str(
+                f.write("Epoch " + str(epoch) + "\nave_train_loss: " + str(ave_train_loss) + "\neval_loss: " + str(
                     eval_loss) + "\ntime: " + time.asctime(time.localtime(time.time())) + "\n\n")
+            
             wandb.log({
-                "train_loss": train_loss,
-                "eval_loss": eval_loss,
-                "epoch": epoch
-            })
-            if eval_loss < min_eval_loss:
-                best_epoch = epoch
-                min_eval_loss = eval_loss
-                checkpoint = {
-                    'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
-                    'optimizer': optimizer.state_dict(),
-                    'lr_sched': lr_scheduler.state_dict(),
-                    'epoch': epoch,
-                    'best_epoch': best_epoch,
-                    'min_eval_loss': min_eval_loss
-                }
-                torch.save(checkpoint, WEIGHTS_PATH)
+                "epoch_ave_train_loss": ave_train_loss,
+                "ave_eval_loss": eval_loss,
+                "epoch": epoch,
+                "total_iters": total_iters,
+            }, step=total_iters)
+            
+            # if eval_loss < min_eval_loss:
+            #     best_epoch = epoch
+            #     min_eval_loss = eval_loss
+            #     checkpoint = {
+            #         'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
+            #         'optimizer': optimizer.state_dict(),
+            #         'lr_sched': lr_scheduler.state_dict(),
+            #         'epoch': epoch,
+            #         'best_epoch': best_epoch,
+            #         'min_eval_loss': min_eval_loss,
+            #         'total_iters': total_iters,
+            #     }
+            #     torch.save(checkpoint, WEIGHTS_PATH)
                 # torch.save(dataloader.state_dict(), checkpoint_path)
 
         if world_size > 1:
@@ -487,11 +601,14 @@ def main(args):
         print("Min Eval Loss : " + str(min_eval_loss))
 
 
-
 # train and eval
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Training script for bGBT")
-    parser.add_argument("--train-config-path", type=str, required=True, help="Path to the config YAML file for training run")
+    parser.add_argument("--train-config-path", type=str, required=True,
+                        help="Path to the config YAML file for training run")
+    parser.add_argument("--load-from-checkpoint", action='store_true', dest="load_from_checkpoint",
+                        help="If this flag is present, model checkpoint will be loaded. By default without the flag, checkpoint will not be loaded.")
+    parser.set_defaults(load_from_checkpoint=False)
     args = parser.parse_args()
 
     main(args)
diff --git a/scripts/abc-midi/README.md b/scripts/abc-midi/README.md
new file mode 100644
index 0000000..5ce7c5c
--- /dev/null
+++ b/scripts/abc-midi/README.md
@@ -0,0 +1,22 @@
+#### Configuration
+Clone this repo for abc->midi conversion. The same repository was mentioned in the paper for conversion.
+```
+    cd path/to/scripts/abc-midi
+    git clone git@github.com:xlvector/abcmidi.git
+    cd abcmidi
+    cmake
+```
+
+**NOTE** Change the directory used for saving accordingly.
+### Download irishman dataset
+```
+    cd path/to/scripts/abc-midi
+    python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split train -o ../../data/abc_data/train
+    python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split validation -o ../../data/abc_data/val
+```
+### abc to midi file conversion
+```
+    cd path/to/scripts/abc-midi
+    python abc2midi.py ../../data/abc_data/train -o ../../data/midi_data/train
+    python abc2midi.py ../../data/abc_data/val -o ../../data/midi_data/val
+```
diff --git a/scripts/abc-midi/abc2midi.py b/scripts/abc-midi/abc2midi.py
new file mode 100644
index 0000000..630ccd6
--- /dev/null
+++ b/scripts/abc-midi/abc2midi.py
@@ -0,0 +1,53 @@
+import os 
+import argparse
+import subprocess as sb
+from tqdm import tqdm
+from multiprocessing import Pool
+from functools import partial
+
+
+def _single_file(filename, abc_path, output_path):
+    """Run abc2midi (ref: https://github.com/xlvector/abcmidi) on one file; return (stdout, stderr) as text."""
+    # splitext strips whatever extension the file has instead of assuming a 4-character ".abc" suffix
+    midi_path = os.path.join(output_path, os.path.splitext(filename)[0] + ".mid")
+    command = ["abcmidi/abc2midi", os.path.join(abc_path, filename), "-o", midi_path]
+    process = sb.run(command, stderr=sb.PIPE, stdout=sb.PIPE)
+    # outputs are kept for the conversion log; undecodable bytes are replaced rather than raising
+    stdout_output = process.stdout.decode('utf-8', errors='replace')
+    return stdout_output, process.stderr.decode('utf-8', errors='replace')
+
+# install abcmidi package first
+def convert_abc_to_midi(abc_path, output_path):
+    """Convert every file in abc_path to a midi file in output_path, logging abc2midi output to ./conversion_log."""
+    # exist_ok avoids the check-then-create race if the directory appears between the two calls
+    os.makedirs(output_path, exist_ok=True)
+
+    # check log file after conversion finishes for any errors if midi files are not working
+    with open("conversion_log", "w") as log_file, Pool() as pool:
+        filenames = os.listdir(abc_path)
+        process_func = partial(_single_file, abc_path=abc_path, output_path=output_path)
+        # reduce chunksize for lower memory usage
+        results = pool.imap(process_func, filenames, chunksize=20)
+        progress = tqdm(results, total=len(filenames), desc="Converting abc files to midi files")
+
+        # write each result as it arrives instead of holding every captured output in a list first
+        for stdout_output, stderr_output in progress:
+            log_file.write("STDOUT output:\n" + stdout_output + "\n\n")
+            log_file.write("STDERR output:\n" + stderr_output + "\n\n")
+
+
+def main():
+    """Parse the command-line arguments and convert the abc directory to midi files."""
+    cli = argparse.ArgumentParser(description="Convert abc files to midi files.")
+    cli.add_argument("abc_path", type=str, help="Path to directory containing abc files.")
+    cli.add_argument("-o", "--output_path", required=True, type=str, help="Output directory for saving midi files.")
+    options = cli.parse_args()
+    convert_abc_to_midi(abc_path=options.abc_path, output_path=options.output_path)
+
+if __name__ == "__main__":
+    main()
+
+    # e.g. python abc2midi.py ../../data/abc_data/train -o ../../data/midi_data/train
+    # e.g. python abc2midi.py ../../data/abc_data/val -o ../../data/midi_data/val
+    #      python abc2midi.py abc_data/val -o midi_data/val
+    
\ No newline at end of file
diff --git a/scripts/abc-midi/prep_abc_data.py b/scripts/abc-midi/prep_abc_data.py
new file mode 100644
index 0000000..e95c1cc
--- /dev/null
+++ b/scripts/abc-midi/prep_abc_data.py
@@ -0,0 +1,39 @@
+import os
+import argparse
+from datasets import load_dataset
+from tqdm import tqdm
+
+def get_abc(dataset, cache_path, split, output_path):
+    """Save each sample's 'abc notation' field from the loaded dataset split as <index>.abc in output_path."""
+    # exist_ok avoids the check-then-create race if the directory appears between the two calls
+    os.makedirs(output_path, exist_ok=True)
+
+    data = load_dataset(dataset, cache_dir=cache_path, split=split)
+    # Loop through the dataset and save each 'abc notation' value into separate .abc files
+    for idx, sample in tqdm(enumerate(data), total=len(data), desc="Unpacking dataset into abc files"):
+        abc_notation = sample['abc notation']
+        file_path = os.path.join(output_path, f"{idx}.abc")
+        # explicit encoding so non-ASCII tune text does not depend on the platform's default encoding
+        with open(file_path, 'w', encoding='utf-8') as file:
+            file.write(abc_notation)
+
+
+
+def main():
+    """Read the dataset id, cache dir, split and output dir from the command line and unpack the dataset."""
+    cli = argparse.ArgumentParser(description="Download and setup abc file train-val directories.")
+    cli.add_argument("dataset_id", type=str, help="Specify huggingface dataset id flag.")
+    cli.add_argument("cache_dir", type=str, help="Path to download huggingface dataset.")
+    cli.add_argument("--split", required=True, type=str, help="Train/Validation/Test split depending on the dataset.")
+    cli.add_argument("-o", "--output_dir", required=True, type=str, help="Set output directory for saving abc files.")
+    options = cli.parse_args()
+
+    get_abc(dataset=options.dataset_id, cache_path=options.cache_dir, split=options.split, output_path=options.output_dir)
+
+
+if __name__ == "__main__":
+    main()
+
+    # e.g. 
+    # python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split train -o ../../data/abc_data/train
+    # python prep_abc_data.py sander-wood/irishman ../../data/hf_cache --split validation -o ../../data/abc_data/val
\ No newline at end of file
diff --git a/scripts/abc-midi/train.sh b/scripts/abc-midi/train.sh
new file mode 100644
index 0000000..721542c
--- /dev/null
+++ b/scripts/abc-midi/train.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --job-name=abc-midi-bidirectional-btch16-paatch16-v1
+#SBATCH --output=./logs/abcmidi-btch16-ptch16-v1.out
+#SBATCH --error=./logs/abcmidi-btch16-ptch16-v1.err
+#SBATCH --cpus-per-task=4
+#SBATCH --gres=gpu:a100:1
+#SBATCH --mem=32G
+#SBATCH --time=48:00:00
+#SBATCH --partition=unkillable
+
+# Export the third field of each line matching api_key in the user's wandb config as the wandb API key.
+export WANDB_API_KEY="$(awk '/api_key/ {print $3}' /home/mila/m/mina.beiramy/.wandb_config)"
+module load anaconda/3
+module load cudatoolkit/11.6
+conda activate bgpt
+python ../../bgpt/train-gen-og.py
\ No newline at end of file
diff --git a/scripts/easy-math/text_file_generator_batched.py b/scripts/easy-math/text_file_generator_batched.py
index fb7b0dc..d0b2121 100644
--- a/scripts/easy-math/text_file_generator_batched.py
+++ b/scripts/easy-math/text_file_generator_batched.py
@@ -59,14 +59,14 @@ def merge_write(input_files, output_dir, max_examples_per_file, max_num_output_f
 
 
 # List of input file names
-input_files = ['/lustre/orion/csc590/proj-shared/math-adder/2-digit-symbols/train.txt', 
-               '/lustre/orion/csc590/proj-shared/math-adder/3-digit-symbols/train.txt',
-               '/lustre/orion/csc590/proj-shared/math-adder/4-digit-symbols/train.txt',
-               '/lustre/orion/csc590/proj-shared/math-adder/5-digit-symbols/train.txt',
+input_files = ['/nfs/scratch/jonathan/byte_models/data/2-digit-symbols/train.txt', 
+               '/nfs/scratch/jonathan/byte_models/data/3-digit-symbols/train.txt', 
+               '/nfs/scratch/jonathan/byte_models/data/4-digit-symbols/train.txt', 
+               '/nfs/scratch/jonathan/byte_models/data/5-digit-symbols/train.txt', 
                ]
 
 # Output file name
-output_dir = '/lustre/orion/csc590/proj-shared/math-adder/merged-symbols/train'
+output_dir = '/nfs/scratch/jonathan/byte_models/data/math-adder/merged-symbols/train'
 Path(output_dir).mkdir(parents=True, exist_ok=True)
 # Will generate output files, each output file having examples interleaved from the various input files
 merge_write(input_files, output_dir, max_examples_per_file=550, max_num_output_files=1000000)