From f2b3747ce741769a1ea501ea555314a5edbbce43 Mon Sep 17 00:00:00 2001
From: yumingxuan <yumingxuan@meituan.com>
Date: Wed, 20 May 2026 16:35:30 +0800
Subject: [PATCH 1/6] feat: add torch profiler support alongside nsys

- add callbacks/profiler.py (new file) with NsysCallback + TorchProfilerCallback
- add profiler_type arg (nsys/torch) to megatron_args
- add profile_rank arg (local rank to profile; -1 = all ranks, default 0)
- TorchProfilerCallback: torch.profiler.profile with step schedule, saves to output_dir/torch_profiler/
- NsysCallback: add check_error on cudaProfilerStart/Stop
- update mapping.py with torch_profiler entry
---
 swift/megatron/arguments/megatron_args.py |   7 ++
 swift/megatron/callbacks/mapping.py       |   3 +
 swift/megatron/callbacks/profiler.py      | 103 ++++++++++++++++++++++
 3 files changed, 113 insertions(+)
 create mode 100644 swift/megatron/callbacks/profiler.py

diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index 8fc1cfc1b3..1493a4c8d3 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -595,6 +595,10 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None
     save_strategy: Literal['steps', 'epoch'] = 'steps'
     callbacks: List[str] = field(default_factory=list)
+    nsys_profile_start: int = 0  # 1-based; 0 = disabled
+    nsys_profile_end: int = 5
+    profiler_type: str = 'nsys'  # nsys or torch
+    profile_rank: int = 0  # local rank to profile; -1 = all ranks
 
     @staticmethod
     def load_args_config(ckpt_dir: Optional[str]) -> Dict[str, Any]:
@@ -705,6 +709,9 @@ def __post_init__(self):
                 self.gradient_accumulation_fusion = False
         self.callbacks += ['print', 'default_flow']
         self.callbacks += self.report_to
+        if self.nsys_profile_start > 0:
+            cb = 'torch_profiler' if self.profiler_type == 'torch' else 'nsys'
+            self.callbacks.append(cb)
         if self.save_total_limit is not None:
             if self.async_save:
                 raise ValueError('async_save is not supported with save_total_limit.')
diff --git a/swift/megatron/callbacks/mapping.py b/swift/megatron/callbacks/mapping.py
index 68e13269c1..bb5179d202 100644
--- a/swift/megatron/callbacks/mapping.py
+++ b/swift/megatron/callbacks/mapping.py
@@ -1,5 +1,6 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from .default_flow import DefaultFlowCallback
+from .profiler import NsysCallback, TorchProfilerCallback
 from .print import PrintCallback
 from .swanlab import SwanlabCallback
 from .tensorboard import TensorboardCallback
@@ -8,6 +9,8 @@
 megatron_callbacks_map = {
     'print': PrintCallback,
     'default_flow': DefaultFlowCallback,
+    'nsys': NsysCallback,
+    'torch_profiler': TorchProfilerCallback,
     'swanlab': SwanlabCallback,
     'wandb': WandbCallback,
     'tensorboard': TensorboardCallback,
diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py
new file mode 100644
index 0000000000..e4dc7f134e
--- /dev/null
+++ b/swift/megatron/callbacks/profiler.py
@@ -0,0 +1,103 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import os
+import torch
+import torch.profiler
+from swift.megatron.callbacks.base import MegatronCallback
+
+
+class NsysCallback(MegatronCallback):
+    """Profile steps [nsys_profile_start, nsys_profile_end] via cudaProfilerStart/Stop.
+
+    Requires nsys launched with --start-later --capture-range=cudaProfilerApi.
+    profile_rank controls which local rank triggers profiling (default 0).
+    Set NSYS_PER_RANK=1 to override and profile every rank independently.
+    Step numbers are 1-based.
+    """
+
+    def __init__(self, trainer):
+        super().__init__(trainer)
+        self.start_step = getattr(self.args, 'nsys_profile_start', 5)
+        self.end_step = getattr(self.args, 'nsys_profile_end', 5)
+        self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
+        self._profile_rank = getattr(self.args, 'profile_rank', 0)
+        self._per_rank = os.environ.get('NSYS_PER_RANK', '0') == '1'
+        self._profiling = False
+
+    def _should_profile(self):
+        if self._per_rank:
+            return True
+        return self._profile_rank == -1 or self._local_rank == self._profile_rank
+
+    def on_step_begin(self):
+        if not self._should_profile():
+            return
+        step = self.state.iteration + 1
+        if step == self.start_step and not self._profiling:
+            print(f'[nsys] cudaProfilerStart at step {step} (local_rank={self._local_rank})',
+                  flush=True)
+            torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart())
+            self._profiling = True
+
+    def on_step_end(self):
+        if not self._should_profile():
+            return
+        step = self.state.iteration
+        if self._profiling and step >= self.end_step:
+            print(f'[nsys] cudaProfilerStop after step {step} (local_rank={self._local_rank})',
+                  flush=True)
+            torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStop())
+            self._profiling = False
+
+
+class TorchProfilerCallback(MegatronCallback):
+    """Profile steps [nsys_profile_start, nsys_profile_end] via torch.profiler.
+
+    profile_rank controls which local rank profiles (default 0); -1 = all ranks.
+    Saves TensorBoard traces to {tensorboard_dir}/torch_profiler/rank{R}_node{N}/.
+    Step numbers are 1-based.
+    """
+
+    def __init__(self, trainer):
+        super().__init__(trainer)
+        self.start_step = getattr(self.args, 'nsys_profile_start', 5)
+        self.end_step = getattr(self.args, 'nsys_profile_end', 5)
+        self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
+        self._node_rank = int(os.environ.get('NODE_RANK', 0))
+        self._profile_rank = getattr(self.args, 'profile_rank', 0)
+        self._prof = None
+
+    def _should_profile(self):
+        return self._profile_rank == -1 or self._local_rank == self._profile_rank
+
+    def on_train_begin(self):
+        if not self._should_profile():
+            return
+        wait = max(0, self.start_step - 1)
+        active = max(1, self.end_step - self.start_step + 1)
+        base_dir = self.args.output_dir
+        trace_dir = os.path.join(
+            base_dir, 'torch_profiler',
+            f'rank{self._local_rank}_node{self._node_rank}')
+        os.makedirs(trace_dir, exist_ok=True)
+        self._prof = torch.profiler.profile(
+            schedule=torch.profiler.schedule(wait=wait, warmup=0, active=active, repeat=1),
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(trace_dir),
+            record_shapes=True,
+            profile_memory=True,
+            with_stack=True,
+            with_flops=True,
+        )
+        self._prof.__enter__()
+        print(f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} '
+              f'wait={wait} active={active} trace_dir={trace_dir}', flush=True)
+
+    def on_step_end(self):
+        if self._prof is not None:
+            self._prof.step()
+
+    def on_train_end(self):
+        if self._prof is not None:
+            self._prof.__exit__(None, None, None)
+            print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank}',
+                  flush=True)
+            self._prof = None

From 32edf43b7c90502e2b21614dc727fea160bdf3d3 Mon Sep 17 00:00:00 2001
From: yumingxuan <yumingxuan@meituan.com>
Date: Wed, 20 May 2026 16:55:20 +0800
Subject: [PATCH 2/6] default profiler_type to 'none' and gate the profiler callback on it

---
 swift/megatron/arguments/megatron_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index d1d5b2ec5b..98bf538fd7 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -641,7 +641,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     callbacks: List[str] = field(default_factory=list)
     nsys_profile_start: int = 0  # 1-based; 0 = disabled
     nsys_profile_end: int = 5
-    profiler_type: str = 'nsys'  # nsys or torch
+    profiler_type: str = 'none'  # nsys or torch
     profile_rank: int = 0  # local rank to profile; -1 = all ranks
 
     @staticmethod
@@ -758,7 +758,7 @@ def __post_init__(self):
                 self.gradient_accumulation_fusion = False
         self.callbacks += ['print', 'default_flow']
         self.callbacks += self.report_to
-        if self.nsys_profile_start > 0:
+        if self.profiler_type != 'none':
             cb = 'torch_profiler' if self.profiler_type == 'torch' else 'nsys'
             self.callbacks.append(cb)
         if self.save_total_limit is not None:

From 79d75dfbdf84dc3dc61369401b761142ae6eb058 Mon Sep 17 00:00:00 2001
From: yumingxuan <yumingxuan@meituan.com>
Date: Wed, 20 May 2026 17:00:05 +0800
Subject: [PATCH 3/6] default nsys_profile_start/end to -1 and drop the NSYS_PER_RANK override

---
 swift/megatron/arguments/megatron_args.py |  4 ++--
 swift/megatron/callbacks/profiler.py      | 11 +++--------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index 98bf538fd7..b00ddc8fe4 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -639,8 +639,8 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None
     save_strategy: Literal['steps', 'epoch'] = 'steps'
     callbacks: List[str] = field(default_factory=list)
-    nsys_profile_start: int = 0  # 1-based; 0 = disabled
-    nsys_profile_end: int = 5
+    nsys_profile_start: int = -1  # 1-based; 0 = disabled
+    nsys_profile_end: int = -1 
     profiler_type: str = 'none'  # nsys or torch
     profile_rank: int = 0  # local rank to profile; -1 = all ranks
 
diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py
index e4dc7f134e..37180e9d78 100644
--- a/swift/megatron/callbacks/profiler.py
+++ b/swift/megatron/callbacks/profiler.py
@@ -10,22 +10,17 @@ class NsysCallback(MegatronCallback):
 
     Requires nsys launched with --start-later --capture-range=cudaProfilerApi.
     profile_rank controls which local rank triggers profiling (default 0).
-    Set NSYS_PER_RANK=1 to override and profile every rank independently.
-    Step numbers are 1-based.
     """
 
     def __init__(self, trainer):
         super().__init__(trainer)
-        self.start_step = getattr(self.args, 'nsys_profile_start', 5)
-        self.end_step = getattr(self.args, 'nsys_profile_end', 5)
+        self.start_step = getattr(self.args, 'nsys_profile_start', -1)
+        self.end_step = getattr(self.args, 'nsys_profile_end', -1)
         self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
         self._profile_rank = getattr(self.args, 'profile_rank', 0)
-        self._per_rank = os.environ.get('NSYS_PER_RANK', '0') == '1'
         self._profiling = False
 
     def _should_profile(self):
-        if self._per_rank:
-            return True
         return self._profile_rank == -1 or self._local_rank == self._profile_rank
 
     def on_step_begin(self):
@@ -61,9 +56,9 @@ def __init__(self, trainer):
         super().__init__(trainer)
         self.start_step = getattr(self.args, 'nsys_profile_start', 5)
         self.end_step = getattr(self.args, 'nsys_profile_end', 5)
+        self._profile_rank = getattr(self.args, 'profile_rank', 0)
         self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
         self._node_rank = int(os.environ.get('NODE_RANK', 0))
-        self._profile_rank = getattr(self.args, 'profile_rank', 0)
         self._prof = None
 
     def _should_profile(self):

From c002e290983c7a68b4290897ce507d9ad18e5891 Mon Sep 17 00:00:00 2001
From: yumingxuan <yumingxuan@meituan.com>
Date: Wed, 20 May 2026 17:13:48 +0800
Subject: [PATCH 4/6] export a Chrome trace on train end instead of using tensorboard_trace_handler

---
 swift/megatron/callbacks/profiler.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py
index 37180e9d78..79feb453a7 100644
--- a/swift/megatron/callbacks/profiler.py
+++ b/swift/megatron/callbacks/profiler.py
@@ -60,6 +60,7 @@ def __init__(self, trainer):
         self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
         self._node_rank = int(os.environ.get('NODE_RANK', 0))
         self._prof = None
+        self._trace_dir = None
 
     def _should_profile(self):
         return self._profile_rank == -1 or self._local_rank == self._profile_rank
@@ -76,12 +77,12 @@ def on_train_begin(self):
         os.makedirs(trace_dir, exist_ok=True)
         self._prof = torch.profiler.profile(
             schedule=torch.profiler.schedule(wait=wait, warmup=0, active=active, repeat=1),
-            on_trace_ready=torch.profiler.tensorboard_trace_handler(trace_dir),
             record_shapes=True,
             profile_memory=True,
             with_stack=True,
             with_flops=True,
         )
+        self._trace_dir = trace_dir
         self._prof.__enter__()
         print(f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} '
               f'wait={wait} active={active} trace_dir={trace_dir}', flush=True)
@@ -93,6 +94,9 @@ def on_step_end(self):
     def on_train_end(self):
         if self._prof is not None:
             self._prof.__exit__(None, None, None)
-            print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank}',
-                  flush=True)
+            chrome_path = os.path.join(self._trace_dir, 'chrome_trace.json')
+            self._prof.export_chrome_trace(chrome_path)
+            print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank} '
+                  f'chrome={chrome_path}', flush=True)
             self._prof = None
+            self._trace_dir = None

From 774d0653b52838c6de4c5357e5db178d7eeb7885 Mon Sep 17 00:00:00 2001
From: yumingxuan <yumingxuan@meituan.com>
Date: Wed, 20 May 2026 17:24:48 +0800
Subject: [PATCH 5/6] select profiled ranks by a list of global ranks (None = all ranks)

---
 swift/megatron/arguments/megatron_args.py |  2 +-
 swift/megatron/callbacks/profiler.py      | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index b00ddc8fe4..b56d1d01e9 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -642,7 +642,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     nsys_profile_start: int = -1  # 1-based; 0 = disabled
     nsys_profile_end: int = -1 
     profiler_type: str = 'none'  # nsys or torch
-    profile_rank: int = 0  # local rank to profile; -1 = all ranks
+    profile_rank: Optional[List[int]] = None  # global ranks to profile; None = all ranks
 
     @staticmethod
     def load_args_config(ckpt_dir: Optional[str]) -> Dict[str, Any]:
diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py
index 79feb453a7..5f41291fbb 100644
--- a/swift/megatron/callbacks/profiler.py
+++ b/swift/megatron/callbacks/profiler.py
@@ -9,7 +9,7 @@ class NsysCallback(MegatronCallback):
     """Profile steps [nsys_profile_start, nsys_profile_end] via cudaProfilerStart/Stop.
 
     Requires nsys launched with --start-later --capture-range=cudaProfilerApi.
-    profile_rank controls which local rank triggers profiling (default 0).
+    profile_rank: list of global ranks to profile; None = all ranks.
     """
 
     def __init__(self, trainer):
@@ -17,11 +17,12 @@ def __init__(self, trainer):
         self.start_step = getattr(self.args, 'nsys_profile_start', -1)
         self.end_step = getattr(self.args, 'nsys_profile_end', -1)
         self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
-        self._profile_rank = getattr(self.args, 'profile_rank', 0)
+        self._global_rank = int(os.environ.get('RANK', 0))
+        self._profile_ranks = getattr(self.args, 'profile_rank', None)
         self._profiling = False
 
     def _should_profile(self):
-        return self._profile_rank == -1 or self._local_rank == self._profile_rank
+        return self._profile_ranks is None or self._global_rank in self._profile_ranks
 
     def on_step_begin(self):
         if not self._should_profile():
@@ -47,7 +48,7 @@ def on_step_end(self):
 class TorchProfilerCallback(MegatronCallback):
     """Profile steps [nsys_profile_start, nsys_profile_end] via torch.profiler.
 
-    profile_rank controls which local rank profiles (default 0); -1 = all ranks.
+    profile_rank: list of global ranks to profile; None = all ranks.
     Saves TensorBoard traces to {tensorboard_dir}/torch_profiler/rank{R}_node{N}/.
     Step numbers are 1-based.
     """
@@ -56,14 +57,15 @@ def __init__(self, trainer):
         super().__init__(trainer)
         self.start_step = getattr(self.args, 'nsys_profile_start', 5)
         self.end_step = getattr(self.args, 'nsys_profile_end', 5)
-        self._profile_rank = getattr(self.args, 'profile_rank', 0)
         self._local_rank = int(os.environ.get('LOCAL_RANK', 0))
+        self._global_rank = int(os.environ.get('RANK', 0))
         self._node_rank = int(os.environ.get('NODE_RANK', 0))
+        self._profile_ranks = getattr(self.args, 'profile_rank', None)
         self._prof = None
         self._trace_dir = None
 
     def _should_profile(self):
-        return self._profile_rank == -1 or self._local_rank == self._profile_rank
+        return self._profile_ranks is None or self._global_rank in self._profile_ranks
 
     def on_train_begin(self):
         if not self._should_profile():

From 905f1f085c4c5ee3844c8f9238431ae857a78a23 Mon Sep 17 00:00:00 2001
From: yumingxuan <yumingxuan@meituan.com>
Date: Wed, 20 May 2026 17:30:50 +0800
Subject: [PATCH 6/6] style: apply pre-commit formatting fixes

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 swift/megatron/arguments/megatron_args.py |  2 +-
 swift/megatron/callbacks/mapping.py       |  2 +-
 swift/megatron/callbacks/profiler.py      | 23 ++++++++++++-----------
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
index b56d1d01e9..f416236c72 100644
--- a/swift/megatron/arguments/megatron_args.py
+++ b/swift/megatron/arguments/megatron_args.py
@@ -640,7 +640,7 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
     save_strategy: Literal['steps', 'epoch'] = 'steps'
     callbacks: List[str] = field(default_factory=list)
     nsys_profile_start: int = -1  # 1-based; 0 = disabled
-    nsys_profile_end: int = -1 
+    nsys_profile_end: int = -1
     profiler_type: str = 'none'  # nsys or torch
     profile_rank: Optional[List[int]] = None  # global ranks to profile; None = all ranks
 
diff --git a/swift/megatron/callbacks/mapping.py b/swift/megatron/callbacks/mapping.py
index bb5179d202..4cf6f3ede8 100644
--- a/swift/megatron/callbacks/mapping.py
+++ b/swift/megatron/callbacks/mapping.py
@@ -1,7 +1,7 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from .default_flow import DefaultFlowCallback
-from .profiler import NsysCallback, TorchProfilerCallback
 from .print import PrintCallback
+from .profiler import NsysCallback, TorchProfilerCallback
 from .swanlab import SwanlabCallback
 from .tensorboard import TensorboardCallback
 from .wandb import WandbCallback
diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py
index 5f41291fbb..ed00060f1b 100644
--- a/swift/megatron/callbacks/profiler.py
+++ b/swift/megatron/callbacks/profiler.py
@@ -2,6 +2,7 @@
 import os
 import torch
 import torch.profiler
+
 from swift.megatron.callbacks.base import MegatronCallback
 
 
@@ -29,8 +30,7 @@ def on_step_begin(self):
             return
         step = self.state.iteration + 1
         if step == self.start_step and not self._profiling:
-            print(f'[nsys] cudaProfilerStart at step {step} (local_rank={self._local_rank})',
-                  flush=True)
+            print(f'[nsys] cudaProfilerStart at step {step} (local_rank={self._local_rank})', flush=True)
             torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart())
             self._profiling = True
 
@@ -39,8 +39,7 @@ def on_step_end(self):
             return
         step = self.state.iteration
         if self._profiling and step >= self.end_step:
-            print(f'[nsys] cudaProfilerStop after step {step} (local_rank={self._local_rank})',
-                  flush=True)
+            print(f'[nsys] cudaProfilerStop after step {step} (local_rank={self._local_rank})', flush=True)
             torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStop())
             self._profiling = False
 
@@ -73,9 +72,7 @@ def on_train_begin(self):
         wait = max(0, self.start_step - 1)
         active = max(1, self.end_step - self.start_step + 1)
         base_dir = self.args.output_dir
-        trace_dir = os.path.join(
-            base_dir, 'torch_profiler',
-            f'rank{self._local_rank}_node{self._node_rank}')
+        trace_dir = os.path.join(base_dir, 'torch_profiler', f'rank{self._local_rank}_node{self._node_rank}')
         os.makedirs(trace_dir, exist_ok=True)
         self._prof = torch.profiler.profile(
             schedule=torch.profiler.schedule(wait=wait, warmup=0, active=active, repeat=1),
@@ -86,8 +83,10 @@ def on_train_begin(self):
         )
         self._trace_dir = trace_dir
         self._prof.__enter__()
-        print(f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} '
-              f'wait={wait} active={active} trace_dir={trace_dir}', flush=True)
+        print(
+            f'[torch_profiler] started rank={self._local_rank} node={self._node_rank} '
+            f'wait={wait} active={active} trace_dir={trace_dir}',
+            flush=True)
 
     def on_step_end(self):
         if self._prof is not None:
@@ -98,7 +97,9 @@ def on_train_end(self):
             self._prof.__exit__(None, None, None)
             chrome_path = os.path.join(self._trace_dir, 'chrome_trace.json')
             self._prof.export_chrome_trace(chrome_path)
-            print(f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank} '
-                  f'chrome={chrome_path}', flush=True)
+            print(
+                f'[torch_profiler] trace saved rank={self._local_rank} node={self._node_rank} '
+                f'chrome={chrome_path}',
+                flush=True)
             self._prof = None
             self._trace_dir = None