From f2b3747ce741769a1ea501ea555314a5edbbce43 Mon Sep 17 00:00:00 2001 From: yumingxuan Date: Wed, 20 May 2026 16:35:30 +0800 Subject: [PATCH 1/6] feat: add torch profiler support alongside nsys - rename nsys.py -> profiler.py with NsysCallback + TorchProfilerCallback - add profiler_type arg (nsys/torch) to megatron_args - add profile_rank arg (-1=all, 0=rank0 default) - TorchProfilerCallback: torch.profiler.profile with step schedule, saves to output_dir/torch_profiler/ - NsysCallback: add check_error on cudaProfilerStart/Stop - update mapping.py with torch_profiler entry --- swift/megatron/arguments/megatron_args.py | 7 ++ swift/megatron/callbacks/mapping.py | 3 + swift/megatron/callbacks/profiler.py | 103 ++++++++++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 swift/megatron/callbacks/profiler.py diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index 8fc1cfc1b3..1493a4c8d3 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -595,6 +595,10 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None save_strategy: Literal['steps', 'epoch'] = 'steps' callbacks: List[str] = field(default_factory=list) + nsys_profile_start: int = 0 # 1-based; 0 = disabled + nsys_profile_end: int = 5 + profiler_type: str = 'nsys' # nsys or torch + profile_rank: int = 0 # local rank to profile; -1 = all ranks @staticmethod def load_args_config(ckpt_dir: Optional[str]) -> Dict[str, Any]: @@ -705,6 +709,9 @@ def __post_init__(self): self.gradient_accumulation_fusion = False self.callbacks += ['print', 'default_flow'] self.callbacks += self.report_to + if self.nsys_profile_start > 0: + cb = 'torch_profiler' if self.profiler_type == 'torch' else 'nsys' + self.callbacks.append(cb) if self.save_total_limit is not None: if self.async_save: raise ValueError('async_save is not supported with save_total_limit.') diff --git a/swift/megatron/callbacks/mapping.py b/swift/megatron/callbacks/mapping.py index 68e13269c1..bb5179d202 100644 --- a/swift/megatron/callbacks/mapping.py +++ b/swift/megatron/callbacks/mapping.py @@ -1,5 +1,6 @@ # Copyright (c) ModelScope Contributors. All rights reserved. from .default_flow import DefaultFlowCallback +from .profiler import NsysCallback, TorchProfilerCallback from .print import PrintCallback from .swanlab import SwanlabCallback from .tensorboard import TensorboardCallback @@ -8,6 +9,8 @@ megatron_callbacks_map = { 'print': PrintCallback, 'default_flow': DefaultFlowCallback, + 'nsys': NsysCallback, + 'torch_profiler': TorchProfilerCallback, 'swanlab': SwanlabCallback, 'wandb': WandbCallback, 'tensorboard': TensorboardCallback, diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py new file mode 100644 index 0000000000..e4dc7f134e --- /dev/null +++ b/swift/megatron/callbacks/profiler.py @@ -0,0 +1,103 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +import os +import torch +import torch.profiler +from swift.megatron.callbacks.base import MegatronCallback + + +class NsysCallback(MegatronCallback): + """Profile steps [nsys_profile_start, nsys_profile_end] via cudaProfilerStart/Stop. + + Requires nsys launched with --start-later --capture-range=cudaProfilerApi. + profile_rank controls which local rank triggers profiling (default 0). + Set NSYS_PER_RANK=1 to override and profile every rank independently. + Step numbers are 1-based. + """ + + def __init__(self, trainer): + super().__init__(trainer) + self.start_step = getattr(self.args, 'nsys_profile_start', 5) + self.end_step = getattr(self.args, 'nsys_profile_end', 5) + self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) + self._profile_rank = getattr(self.args, 'profile_rank', 0) + self._per_rank = os.environ.get('NSYS_PER_RANK', '0') == '1' + self._profiling = False + + def _should_profile(self): + if self._per_rank: + return True + return self._profile_rank == -1 or self._local_rank == self._profile_rank + + def on_step_begin(self): + if not self._should_profile(): + return + step = self.state.iteration + 1 + if step == self.start_step and not self._profiling: + print(f'[nsys] cudaProfilerStart at step {step} (local_rank={self._local_rank})', + flush=True) + torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart()) + self._profiling = True + + def on_step_end(self): + if not self._should_profile(): + return + step = self.state.iteration + if self._profiling and step >= self.end_step: + print(f'[nsys] cudaProfilerStop after step {step} (local_rank={self._local_rank})', + flush=True) + torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStop()) + self._profiling = False + + +class TorchProfilerCallback(MegatronCallback): + """Profile steps [nsys_profile_start, nsys_profile_end] via torch.profiler. + + profile_rank controls which local rank profiles (default 0); -1 = all ranks. + Saves TensorBoard traces to {tensorboard_dir}/torch_profiler/rank{R}_node{N}/. + Step numbers are 1-based. + """ + + def __init__(self, trainer): + super().__init__(trainer) + self.start_step = getattr(self.args, 'nsys_profile_start', 5) + self.end_step = getattr(self.args, 'nsys_profile_end', 5) + self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) + self._node_rank = int(os.environ.get('NODE_RANK', 0)) + self._profile_rank = getattr(self.args, 'profile_rank', 0) + self._prof = None + + def _should_profile(self): + return self._profile_rank == -1 or self._local_rank == self._profile_rank + + def on_train_begin(self): + if not self._should_profile(): + return + wait = max(0, self.start_step - 1) + active = max(1, self.end_step - self.start_step + 1) + base_dir = self.args.output_dir + trace_dir = os.path.join( + base_dir, 'torch_profiler', + f'rank{self._local_rank}_node{self._node_rank}') + os.makedirs(trace_dir, exist_ok=True) + self._prof = torch.profiler.profile( + schedule=torch.profiler.schedule(wait=wait, warmup=0, active=active, repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler(trace_dir), + record_shapes=True, + profile_memory=True, + with_stack=True, + with_flops=True, + ) + self._prof.__enter__() + print(f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} ' + f'wait={wait} active={active} trace_dir={trace_dir}', flush=True) + + def on_step_end(self): + if self._prof is not None: + self._prof.step() + + def on_train_end(self): + if self._prof is not None: + self._prof.__exit__(None, None, None) + print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank}', + flush=True) + self._prof = None From 32edf43b7c90502e2b21614dc727fea160bdf3d3 Mon Sep 17 00:00:00 2001 From: yumingxuan Date: Wed, 20 May 2026 16:55:20 +0800 Subject: [PATCH 2/6] defaults to none --- swift/megatron/arguments/megatron_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index d1d5b2ec5b..98bf538fd7 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -641,7 +641,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): callbacks: List[str] = field(default_factory=list) nsys_profile_start: int = 0 # 1-based; 0 = disabled nsys_profile_end: int = 5 - profiler_type: str = 'nsys' # nsys or torch + profiler_type: str = 'none' # nsys or torch profile_rank: int = 0 # local rank to profile; -1 = all ranks @staticmethod @@ -758,7 +758,7 @@ def __post_init__(self): self.gradient_accumulation_fusion = False self.callbacks += ['print', 'default_flow'] self.callbacks += self.report_to - if self.nsys_profile_start > 0: + if self.profiler_type != 'none': cb = 'torch_profiler' if self.profiler_type == 'torch' else 'nsys' self.callbacks.append(cb) if self.save_total_limit is not None: From 79d75dfbdf84dc3dc61369401b761142ae6eb058 Mon Sep 17 00:00:00 2001 From: yumingxuan Date: Wed, 20 May 2026 17:00:05 +0800 Subject: [PATCH 3/6] change default value --- swift/megatron/arguments/megatron_args.py | 4 ++-- swift/megatron/callbacks/profiler.py | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index 98bf538fd7..b00ddc8fe4 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -639,8 +639,8 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None save_strategy: Literal['steps', 'epoch'] = 'steps' callbacks: List[str] = field(default_factory=list) - nsys_profile_start: int = 0 # 1-based; 0 = disabled - nsys_profile_end: int = 5 + nsys_profile_start: int = -1 # 1-based; 0 = disabled + nsys_profile_end: int = -1 profiler_type: str = 'none' # nsys or torch profile_rank: int = 0 # local rank to profile; -1 = all ranks diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py index e4dc7f134e..37180e9d78 100644 --- a/swift/megatron/callbacks/profiler.py +++ b/swift/megatron/callbacks/profiler.py @@ -10,22 +10,17 @@ class NsysCallback(MegatronCallback): Requires nsys launched with --start-later --capture-range=cudaProfilerApi. profile_rank controls which local rank triggers profiling (default 0). - Set NSYS_PER_RANK=1 to override and profile every rank independently. - Step numbers are 1-based. """ def __init__(self, trainer): super().__init__(trainer) - self.start_step = getattr(self.args, 'nsys_profile_start', 5) - self.end_step = getattr(self.args, 'nsys_profile_end', 5) + self.start_step = getattr(self.args, 'nsys_profile_start', -1) + self.end_step = getattr(self.args, 'nsys_profile_end', -1) self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) self._profile_rank = getattr(self.args, 'profile_rank', 0) - self._per_rank = os.environ.get('NSYS_PER_RANK', '0') == '1' self._profiling = False def _should_profile(self): - if self._per_rank: - return True return self._profile_rank == -1 or self._local_rank == self._profile_rank def on_step_begin(self): @@ -61,9 +56,9 @@ def __init__(self, trainer): super().__init__(trainer) self.start_step = getattr(self.args, 'nsys_profile_start', 5) self.end_step = getattr(self.args, 'nsys_profile_end', 5) + self._profile_rank = getattr(self.args, 'profile_rank', 0) self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) self._node_rank = int(os.environ.get('NODE_RANK', 0)) - self._profile_rank = getattr(self.args, 'profile_rank', 0) self._prof = None def _should_profile(self): From c002e290983c7a68b4290897ce507d9ad18e5891 Mon Sep 17 00:00:00 2001 From: yumingxuan Date: Wed, 20 May 2026 17:13:48 +0800 Subject: [PATCH 4/6] use chrome trace --- swift/megatron/callbacks/profiler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py index 37180e9d78..79feb453a7 100644 --- a/swift/megatron/callbacks/profiler.py +++ b/swift/megatron/callbacks/profiler.py @@ -60,6 +60,7 @@ def __init__(self, trainer): self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) self._node_rank = int(os.environ.get('NODE_RANK', 0)) self._prof = None + self._trace_dir = None def _should_profile(self): return self._profile_rank == -1 or self._local_rank == self._profile_rank @@ -76,12 +77,12 @@ def on_train_begin(self): os.makedirs(trace_dir, exist_ok=True) self._prof = torch.profiler.profile( schedule=torch.profiler.schedule(wait=wait, warmup=0, active=active, repeat=1), - on_trace_ready=torch.profiler.tensorboard_trace_handler(trace_dir), record_shapes=True, profile_memory=True, with_stack=True, with_flops=True, ) + self._trace_dir = trace_dir self._prof.__enter__() print(f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} ' f'wait={wait} active={active} trace_dir={trace_dir}', flush=True) @@ -93,6 +94,9 @@ def on_step_end(self): def on_train_end(self): if self._prof is not None: self._prof.__exit__(None, None, None) - print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank}', - flush=True) + chrome_path = os.path.join(self._trace_dir, 'chrome_trace.json') + self._prof.export_chrome_trace(chrome_path) + print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank} ' + f'chrome={chrome_path}', flush=True) self._prof = None + self._trace_dir = None From 774d0653b52838c6de4c5357e5db178d7eeb7885 Mon Sep 17 00:00:00 2001 From: yumingxuan Date: Wed, 20 May 2026 17:24:48 +0800 Subject: [PATCH 5/6] global ranks --- swift/megatron/arguments/megatron_args.py | 2 +- swift/megatron/callbacks/profiler.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index b00ddc8fe4..b56d1d01e9 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -642,7 +642,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): nsys_profile_start: int = -1 # 1-based; 0 = disabled nsys_profile_end: int = -1 profiler_type: str = 'none' # nsys or torch - profile_rank: int = 0 # local rank to profile; -1 = all ranks + profile_rank: Optional[List[int]] = None # global ranks to profile; None = all ranks @staticmethod def load_args_config(ckpt_dir: Optional[str]) -> Dict[str, Any]: diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py index 79feb453a7..5f41291fbb 100644 --- a/swift/megatron/callbacks/profiler.py +++ b/swift/megatron/callbacks/profiler.py @@ -9,7 +9,7 @@ class NsysCallback(MegatronCallback): """Profile steps [nsys_profile_start, nsys_profile_end] via cudaProfilerStart/Stop. Requires nsys launched with --start-later --capture-range=cudaProfilerApi. - profile_rank controls which local rank triggers profiling (default 0). + profile_rank: list of global ranks to profile; None = all ranks. """ def __init__(self, trainer): @@ -17,11 +17,12 @@ def __init__(self, trainer): self.start_step = getattr(self.args, 'nsys_profile_start', -1) self.end_step = getattr(self.args, 'nsys_profile_end', -1) self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) - self._profile_rank = getattr(self.args, 'profile_rank', 0) + self._global_rank = int(os.environ.get('RANK', 0)) + self._profile_ranks = getattr(self.args, 'profile_rank', None) self._profiling = False def _should_profile(self): - return self._profile_rank == -1 or self._local_rank == self._profile_rank + return self._profile_ranks is None or self._global_rank in self._profile_ranks def on_step_begin(self): if not self._should_profile(): @@ -47,7 +48,7 @@ def on_step_end(self): class TorchProfilerCallback(MegatronCallback): """Profile steps [nsys_profile_start, nsys_profile_end] via torch.profiler. - profile_rank controls which local rank profiles (default 0); -1 = all ranks. + profile_rank: list of global ranks to profile; None = all ranks. Saves TensorBoard traces to {tensorboard_dir}/torch_profiler/rank{R}_node{N}/. Step numbers are 1-based. """ @@ -56,14 +57,15 @@ def __init__(self, trainer): super().__init__(trainer) self.start_step = getattr(self.args, 'nsys_profile_start', 5) self.end_step = getattr(self.args, 'nsys_profile_end', 5) - self._profile_rank = getattr(self.args, 'profile_rank', 0) self._local_rank = int(os.environ.get('LOCAL_RANK', 0)) + self._global_rank = int(os.environ.get('RANK', 0)) self._node_rank = int(os.environ.get('NODE_RANK', 0)) + self._profile_ranks = getattr(self.args, 'profile_rank', None) self._prof = None self._trace_dir = None def _should_profile(self): - return self._profile_rank == -1 or self._local_rank == self._profile_rank + return self._profile_ranks is None or self._global_rank in self._profile_ranks def on_train_begin(self): if not self._should_profile(): From 905f1f085c4c5ee3844c8f9238431ae857a78a23 Mon Sep 17 00:00:00 2001 From: yumingxuan Date: Wed, 20 May 2026 17:30:50 +0800 Subject: [PATCH 6/6] style: apply pre-commit formatting fixes Co-Authored-By: Claude Opus 4.7 --- swift/megatron/arguments/megatron_args.py | 2 +- swift/megatron/callbacks/mapping.py | 2 +- swift/megatron/callbacks/profiler.py | 23 ++++++++++++----------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py index b56d1d01e9..f416236c72 100644 --- a/swift/megatron/arguments/megatron_args.py +++ b/swift/megatron/arguments/megatron_args.py @@ -640,7 +640,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin): save_strategy: Literal['steps', 'epoch'] = 'steps' callbacks: List[str] = field(default_factory=list) nsys_profile_start: int = -1 # 1-based; 0 = disabled - nsys_profile_end: int = -1 + nsys_profile_end: int = -1 profiler_type: str = 'none' # nsys or torch profile_rank: Optional[List[int]] = None # global ranks to profile; None = all ranks diff --git a/swift/megatron/callbacks/mapping.py b/swift/megatron/callbacks/mapping.py index bb5179d202..4cf6f3ede8 100644 --- a/swift/megatron/callbacks/mapping.py +++ b/swift/megatron/callbacks/mapping.py @@ -1,7 +1,7 @@ # Copyright (c) ModelScope Contributors. All rights reserved. from .default_flow import DefaultFlowCallback -from .profiler import NsysCallback, TorchProfilerCallback from .print import PrintCallback +from .profiler import NsysCallback, TorchProfilerCallback from .swanlab import SwanlabCallback from .tensorboard import TensorboardCallback from .wandb import WandbCallback diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py index 5f41291fbb..ed00060f1b 100644 --- a/swift/megatron/callbacks/profiler.py +++ b/swift/megatron/callbacks/profiler.py @@ -2,6 +2,7 @@ import os import torch import torch.profiler + from swift.megatron.callbacks.base import MegatronCallback @@ -29,8 +30,7 @@ def on_step_begin(self): return step = self.state.iteration + 1 if step == self.start_step and not self._profiling: - print(f'[nsys] cudaProfilerStart at step {step} (local_rank={self._local_rank})', - flush=True) + print(f'[nsys] cudaProfilerStart at step {step} (local_rank={self._local_rank})', flush=True) torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart()) self._profiling = True @@ -39,8 +39,7 @@ def on_step_end(self): return step = self.state.iteration if self._profiling and step >= self.end_step: - print(f'[nsys] cudaProfilerStop after step {step} (local_rank={self._local_rank})', - flush=True) + print(f'[nsys] cudaProfilerStop after step {step} (local_rank={self._local_rank})', flush=True) torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStop()) self._profiling = False @@ -73,9 +72,7 @@ def on_train_begin(self): wait = max(0, self.start_step - 1) active = max(1, self.end_step - self.start_step + 1) base_dir = self.args.output_dir - trace_dir = os.path.join( - base_dir, 'torch_profiler', - f'rank{self._local_rank}_node{self._node_rank}') + trace_dir = os.path.join(base_dir, 'torch_profiler', f'rank{self._local_rank}_node{self._node_rank}') os.makedirs(trace_dir, exist_ok=True) self._prof = torch.profiler.profile( schedule=torch.profiler.schedule(wait=wait, warmup=0, active=active, repeat=1), @@ -86,8 +83,10 @@ def on_train_begin(self): ) self._trace_dir = trace_dir self._prof.__enter__() - print(f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} ' - f'wait={wait} active={active} trace_dir={trace_dir}', flush=True) + print( + f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} ' + f'wait={wait} active={active} trace_dir={trace_dir}', + flush=True) def on_step_end(self): if self._prof is not None: @@ -98,7 +97,9 @@ def on_train_end(self): self._prof.__exit__(None, None, None) chrome_path = os.path.join(self._trace_dir, 'chrome_trace.json') self._prof.export_chrome_trace(chrome_path) - print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank} ' - f'chrome={chrome_path}', flush=True) + print( + f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank} ' + f'chrome={chrome_path}', + flush=True) self._prof = None self._trace_dir = None