2 changes: 1 addition & 1 deletion configs/_cluster/helios.yaml
@@ -21,7 +21,7 @@ infrastructure:
 # export pixi variables
 - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
 - 'export PATH="$HOME/.pixi/bin:$PATH"'
-- 'export XDG_DATA_HOME="PROJECT_HOME_PATH/data"'
+- 'export XDG_DATA_HOME="$PROJECT_HOME_PATH/data"'
 - 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"'
 - 'export XDG_STATE_HOME="$PROJECT_HOME_PATH/state"'

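The one-character fix above matters more than it looks: without the `$`, the shell exports the literal string `PROJECT_HOME_PATH/data`, so anything that honors `XDG_DATA_HOME` resolves it as a relative path. A minimal startup sanity check one could add (the helper `check_xdg_paths` is illustrative, not part of this repo):

```python
import logging
import os

logger = logging.getLogger(__name__)


def check_xdg_paths() -> None:
    """Warn if an XDG_* variable looks unexpanded (e.g. a missing '$')."""
    for var in ("XDG_DATA_HOME", "XDG_CACHE_HOME", "XDG_STATE_HOME"):
        value = os.environ.get(var)
        # A correctly expanded path is absolute; a literal
        # 'PROJECT_HOME_PATH/data' (missing '$') is relative.
        if value is not None and not os.path.isabs(value):
            logger.warning("%s=%r is not an absolute path", var, value)
```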
2 changes: 1 addition & 1 deletion configs/pc_project/llama_1B_importances.yaml
@@ -56,7 +56,7 @@ apply_functions:
 dataloader: ${trainer.train_dataloader}
 dmodel: ${common.dmodel}
 dff: ${common.dff}
-calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence lenght - this is the saturaion poin - longer doesnt improve miningfully
+calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence length - this is the saturation point - longer doesn't improve meaningfully
 seq_len: ${common.sequence_length}
 total_batch_size: ${trainer.train_dataloader.total_batch_size}
 n_blocks: ${model.encoder.n_blocks}
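The corrected comment pins down a concrete budget: importance scores saturate around 2k steps of 4k-token sequences, so a larger calibration set buys nothing. How the 8192-sample setting maps onto that depends on batch size; a back-of-the-envelope sketch (the batch size of 4 is an assumption, not taken from the config):

```python
calibration_dataset_size = 8192  # samples, as set in this config
seq_len = 4096                   # 4k tokens per sequence, per the comment
batch_size = 4                   # assumed; not specified here

steps = calibration_dataset_size // batch_size  # 2048, i.e. ~"2k steps"
tokens = calibration_dataset_size * seq_len     # 33,554,432 tokens
print(f"{steps} steps, {tokens:,} calibration tokens")
```

The same fix appears in `llama_8B_importances.yaml` below.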
2 changes: 1 addition & 1 deletion configs/pc_project/llama_8B_importances.yaml
@@ -56,7 +56,7 @@ apply_functions:
 dataloader: ${trainer.train_dataloader}
 dmodel: ${common.dmodel}
 dff: ${common.dff}
-calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence lenght - this is the saturaion poin - longer doesnt improve miningfully
+calibration_dataset_size: 8192 # nvidia used 2k steps of 4k sequence length - this is the saturation point - longer doesn't improve meaningfully
 seq_len: ${common.sequence_length}
 total_batch_size: ${trainer.train_dataloader.total_batch_size}
 n_blocks: ${model.encoder.n_blocks}
6 changes: 3 additions & 3 deletions main.py
@@ -70,7 +70,7 @@ def check_env_vars():
     assert int(os.environ["RANK"]) < int(os.environ["WORLD_SIZE"])


-def setup_enviroment():
+def setup_environment():
     if "WORLD_SIZE" not in os.environ:
         logger.warning("WORLD_SIZE is not set, setting it to 1")
         os.environ["WORLD_SIZE"] = "1"
@@ -267,7 +267,7 @@ def initialize_training_components(cfg: OmegaConf, metric_logger=None):
             cfg, model, learning_rate
         )
     elif cfg.trainer.checkpoint.load.type == "nano":
-        # TODO! if you want to apply function on loaded model it does NOT work now, it applies function on newly inintialized model than it loads model weights
+        # TODO! applying a function on a loaded model does NOT work now: it applies the function on the newly initialized model, then it loads the model weights
         model, optimizer, scheduler = get_model_optimizer_scheduler(
             cfg, model, learning_rate
         )
@@ -292,7 +292,7 @@ def initialize_training_components(cfg: OmegaConf, metric_logger=None):


 def run(cfg: OmegaConf, metric_logger=None):
-    setup_enviroment()
+    setup_environment()

     if "distributed" in cfg.trainer and cfg.trainer.distributed is not None:
         distributed_setup()
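The TODO fixed in the second hunk describes an ordering bug worth spelling out: apply-functions currently run against the freshly initialized model, and the checkpoint weights are loaded afterwards, so the load overwrites whatever the functions changed. A sketch of the intended order, with hypothetical names (`apply_fns`, the checkpoint path attribute) since the actual fix is not part of this PR:

```python
import torch


def load_then_apply(cfg, model, apply_fns):
    """Illustrative fix for the TODO: load weights BEFORE applying functions."""
    if cfg.trainer.checkpoint.load.type == "nano":
        # 1. Restore the checkpointed weights first...
        state_dict = torch.load(cfg.trainer.checkpoint.load.path)
        model.load_state_dict(state_dict)
    # 2. ...and only then run the apply-functions, so their effect
    #    (e.g. pruning by importance) is not clobbered by the load.
    for fn in apply_fns:
        fn(model)
    return model
```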
2 changes: 1 addition & 1 deletion src/core/utils.py
@@ -46,7 +46,7 @@ def solve_config_lr(
     config_lr: float,
 ) -> tuple[
     float, float
-]:  # TODO temporary place - move to devinitions eval+ when created
+]:  # TODO temporary place - move to definitions eval+ when created
     ret_lr, ret_exp_lr = None, None
     if config_lr < 1.0:
         ret_lr = config_lr
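The diff collapses the rest of `solve_config_lr`, so only the `config_lr < 1.0` branch is visible. One plausible reading of the convention, as a purely hypothetical sketch (the repo's actual else-branch may differ): values below 1.0 are literal learning rates, larger values encode the rate as a negative power of ten.

```python
import math


def solve_config_lr_sketch(config_lr: float) -> tuple[float, float]:
    """Hypothetical: lr < 1.0 is literal, lr >= 1.0 is an exponent."""
    if config_lr < 1.0:
        lr = config_lr                   # e.g. 0.0003 stays 0.0003
        exp_lr = -math.log10(config_lr)  # its exponent form, ~3.52
    else:
        exp_lr = config_lr               # e.g. 3.0 means 1e-3
        lr = 10.0 ** -config_lr
    return lr, exp_lr
```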