Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion nemo/utils/exp_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,10 @@ class ExpManagerConfig:
fault_tolerance: Optional[FaultToleranceParams] = field(default_factory=FaultToleranceParams)
# logs TFLOPs per sec per gpu
log_tflops_per_sec_per_gpu: Optional[bool] = True
# Move old log files into run_0/, run_1/, etc. subdirectories on resume.
# Disable on shared filesystems (e.g. Lustre) where moving files while other ranks
# are starting up can cause FileNotFoundError in Lightning's checkpoint scanner.
move_files_to_run_dirs: Optional[bool] = True
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be handled while exiting the previous run safely?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it's not. The "move to run_X/" logic is triggered at the start of the new job, and it has a race condition: every rank creates a nemo_rankX_*.txt log file -> some ranks are faster, some are slower -> one rank lists the files and then calls os.stat on them -> between the two calls, rank 0 has moved the file into run_X/ -> os.stat fails with FileNotFoundError.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's default to False; we can rethink logging at a later date.



class TimingCallback(Callback):
Expand Down Expand Up @@ -625,6 +629,7 @@ def exp_manager(trainer: 'lightning.pytorch.Trainer', cfg: Optional[Union[DictCo
cfg.resume_ignore_no_checkpoint,
cfg.checkpoint_callback_params.dirpath,
cfg.resume_from_checkpoint,
cfg.move_files_to_run_dirs,
)

checkpoint_name = name
Expand Down Expand Up @@ -903,6 +908,7 @@ def check_resume(
resume_ignore_no_checkpoint: bool = False,
dirpath: str = None,
resume_from_checkpoint: str = None,
move_files_to_run_dirs: bool = True,
):
"""Checks that resume=True was used correctly with the arguments pass to exp_manager. Sets
trainer._checkpoint_connector._ckpt_path as necessary.
Expand Down Expand Up @@ -1054,7 +1060,7 @@ def check_resume(
trainer.ckpt_path = str(checkpoint)
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

if is_global_rank_zero():
if move_files_to_run_dirs and is_global_rank_zero():
# Check to see if any files exist that need to be moved
files_to_move = []
if Path(log_dir).exists():
Expand Down
Loading