From 0b4fd059b1602527d099767781eb7a93d37edd17 Mon Sep 17 00:00:00 2001 From: "1243196045@qq" <1243196045@qq.com> Date: Fri, 29 May 2026 20:55:08 +0800 Subject: [PATCH 01/17] feat: Add native support for PyTorch Profiler in ms-swift --- swift/arguments/base_args/base_args.py | 7 +- swift/arguments/base_args/profile_args.py | 55 ++++++++ swift/callbacks/mapping.py | 4 +- swift/callbacks/profiler.py | 26 ++++ swift/megatron/callbacks/mapping.py | 2 + swift/megatron/callbacks/profiler.py | 23 ++++ swift/trainers/arguments.py | 20 +++ swift/utils/profiler/__init__.py | 7 ++ swift/utils/profiler/config.py | 142 +++++++++++++++++++++ swift/utils/profiler/profile.py | 146 +++++++++++++++++++++ swift/utils/profiler/torch_profile.py | 147 ++++++++++++++++++++++ 11 files changed, 575 insertions(+), 4 deletions(-) create mode 100644 swift/arguments/base_args/profile_args.py create mode 100644 swift/callbacks/profiler.py create mode 100644 swift/megatron/callbacks/profiler.py create mode 100644 swift/utils/profiler/__init__.py create mode 100644 swift/utils/profiler/config.py create mode 100644 swift/utils/profiler/profile.py create mode 100644 swift/utils/profiler/torch_profile.py diff --git a/swift/arguments/base_args/base_args.py b/swift/arguments/base_args/base_args.py index 03697db861..1cf3f03050 100644 --- a/swift/arguments/base_args/base_args.py +++ b/swift/arguments/base_args/base_args.py @@ -17,6 +17,7 @@ from .data_args import DataArguments from .generation_args import GenerationArguments from .model_args import ModelArguments +from .profile_args import ProfilerArguments from .quant_args import QuantizeArguments from .template_args import TemplateArguments @@ -30,12 +31,11 @@ def get_supported_tuners(): @dataclass class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, TemplateArguments, ModelArguments, - RayArguments): + RayArguments, ProfilerArguments): """BaseArguments class is a dataclass that inherits from multiple argument classes. This class consolidates arguments from GenerationArguments, QuantizeArguments, DataArguments, - TemplateArguments, ModelArguments, RayArguments. - + TemplateArguments, ModelArguments, RayArguments, and ProfilerArguments. Args: tuner_backend (str): The tuner backend to use. Choices are 'peft' or 'unsloth'. Default is 'peft'. tuner_type (str): The tuner type. Choices include 'lora', 'full', 'longlora', 'adalora', 'llamapro', @@ -171,6 +171,7 @@ def __post_init__(self): TemplateArguments.__post_init__(self) DataArguments.__post_init__(self) RayArguments.__post_init__(self) + ProfilerArguments.__post_init__(self) self._init_stream() if self.max_length is None and self.model_info is not None: self.max_length = self.model_info.max_model_len diff --git a/swift/arguments/base_args/profile_args.py b/swift/arguments/base_args/profile_args.py new file mode 100644 index 0000000000..10b51e448f --- /dev/null +++ b/swift/arguments/base_args/profile_args.py @@ -0,0 +1,55 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +from dataclasses import dataclass, field +from typing import List, Optional + +from swift.utils import get_logger + +logger = get_logger() + + +@dataclass +class ProfilerArguments: + + enable_profiler: bool = False + profiler_save_path: Optional[str] = None + profiler_all_ranks: bool = False + profiler_ranks: List[int] = field(default_factory=list) + profiler_contents: List[str] = field(default_factory=list) # e.g., "cpu", "cuda", "stack", "memory"."shape" + profiler_discrete: bool = False + profiler_tool: Optional[str] = 'torch' + profiler_steps: Optional[List[int]] = field(default_factory=list) # Steps to profile + + def __post_init__(self): + assert not self.profiler_discrete, \ + 'Profiler discrete mode is not supported yet, please set profiler_discrete to false' + if self.enable_profiler and 'profiler' not in self.callbacks: + self.callbacks.append('profiler') + if 'profiler' in self.callbacks and not self.enable_profiler: + self.enable_profiler = True + if self.enable_profiler: + assert self.profiler_save_path is not None, \ + 'Profiler save path must be specified when profiler is enabled.' + assert self.profiler_contents, \ + 'Profiler contents must be specified when profiler is enabled.' + assert self.profiler_steps, \ + 'Profiler steps must be specified when profiler is enabled.' + assert self.profiler_ranks != [] or self.profiler_all_ranks, \ + 'Either profiler_ranks must be specified or profiler_all_ranks must be set to True.' + if self.enable_profiler: + assert 'profiler' in self.callbacks, \ + 'Profiler callback must be included in callbacks when profiler is enabled.' + if 'profiler' in self.callbacks: + assert self.enable_profiler, \ + 'Profiler callback is included in callbacks but profiler is not enabled.' + + def get_profiler_kwargs(self): + return { + 'enable_profiler': self.enable_profiler, + 'profiler_save_path': self.profiler_save_path, + 'profiler_all_ranks': self.profiler_all_ranks, + 'profiler_ranks': self.profiler_ranks, + 'profiler_contents': self.profiler_contents, + 'profiler_discrete': self.profiler_discrete, + 'profiler_tool': self.profiler_tool, + 'profiler_steps': self.profiler_steps, + } diff --git a/swift/callbacks/mapping.py b/swift/callbacks/mapping.py index 3f18235e79..f6219945f0 100644 --- a/swift/callbacks/mapping.py +++ b/swift/callbacks/mapping.py @@ -5,6 +5,7 @@ from .early_stop import EarlyStopCallback from .lisa import LISACallback from .perf_log import PerfMetricsLogCallback +from .profiler import ProfilerCallback callbacks_map = { 'activation_cpu_offload': ActivationCpuOffloadCallBack, @@ -13,5 +14,6 @@ 'early_stop': EarlyStopCallback, 'graceful_exit': GracefulExitCallback, 'lisa': LISACallback, - 'perf_log': PerfMetricsLogCallback + 'perf_log': PerfMetricsLogCallback, + 'profiler': ProfilerCallback, } diff --git a/swift/callbacks/profiler.py b/swift/callbacks/profiler.py new file mode 100644 index 0000000000..796ce91598 --- /dev/null +++ b/swift/callbacks/profiler.py @@ -0,0 +1,26 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState + +from swift.utils import get_logger +from swift.utils.profiler import DistProfiler + +logger = get_logger() + + +class ProfilerCallback(ProgressCallback): + + def __init__(self, args, trainer): + super().__init__() + self.args = args + self.trainer = trainer + self.trainer.profiler = DistProfiler(global_config=args) + + def on_step_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs): + if self.args.profiler_steps and state.global_step in self.args.profiler_steps: + self.trainer.profiler.start() + super().on_step_begin(args, state, control, **kwargs) + + def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs): + if self.args.profiler_steps and state.global_step + 1 not in self.args.profiler_steps: + self.trainer.profiler.stop() + super().on_step_end(args, state, control, **kwargs) diff --git a/swift/megatron/callbacks/mapping.py b/swift/megatron/callbacks/mapping.py index 68e13269c1..72c8d69075 100644 --- a/swift/megatron/callbacks/mapping.py +++ b/swift/megatron/callbacks/mapping.py @@ -1,6 +1,7 @@ # Copyright (c) ModelScope Contributors. All rights reserved. from .default_flow import DefaultFlowCallback from .print import PrintCallback +from .profiler import ProfilerCallback from .swanlab import SwanlabCallback from .tensorboard import TensorboardCallback from .wandb import WandbCallback @@ -11,4 +12,5 @@ 'swanlab': SwanlabCallback, 'wandb': WandbCallback, 'tensorboard': TensorboardCallback, + 'profiler': ProfilerCallback, } diff --git a/swift/megatron/callbacks/profiler.py b/swift/megatron/callbacks/profiler.py new file mode 100644 index 0000000000..78e25fb55e --- /dev/null +++ b/swift/megatron/callbacks/profiler.py @@ -0,0 +1,23 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +from swift.utils import get_logger +from swift.utils.profiler import DistProfiler +from .base import MegatronCallback + +logger = get_logger() + + +class ProfilerCallback(MegatronCallback): + + def __init__(self, trainer): + super().__init__(trainer) + self.args = trainer.args + self.trainer = trainer + self.trainer.profiler = DistProfiler(global_config=self.args) + + def on_step_begin(self): + if self.args.profiler_steps and self.state.global_step in self.args.profiler_steps: + self.trainer.profiler.start() + + def on_step_end(self): + if self.args.profiler_steps and self.state.global_step + 1 not in self.args.profiler_steps: + self.trainer.profiler.stop() diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index 96f63dfc0d..768879d33b 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -121,6 +121,16 @@ class TrainArgumentsMixin: shared memory and then asynchronously persisted to disk. Currently does not support the safetensors format. It is recommended to use this with `PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"` to prevent CUDA OOM errors during training. Defaults to False. + + enable_profiler (bool): Master switch to enable or disable performance profiling. Default is False. + profiler_save_path (Optional[str]): Directory path where the profiling results and trace files will be saved. + profiler_all_ranks (bool): If True, collects profiling data from all distributed processes. + profiler_ranks (List[int]): A list of specific rank IDs to collect profiling data from. + profiler_contents (List[str]): List of data categories to record, such as "cpu", "cuda", "memory", or "stack". + profiler_discrete (bool): If True, records data for each step independently. + profiler_tool (Optional[str]): Specifies the backend tool used for profiling. + profiler_steps (Optional[List[int]]): A list of specific training steps during which profiling should be active. + """ per_device_train_batch_size: int = 1 per_device_eval_batch_size: int = 1 @@ -202,6 +212,16 @@ class TrainArgumentsMixin: # dlrover flash_checkpoint use_flash_ckpt: bool = False + # profiler + enable_profiler: bool = False + profiler_save_path: Optional[str] = None + profiler_all_ranks: bool = False + profiler_ranks: List[int] = field(default_factory=list) + profiler_contents: List[str] = field(default_factory=list) # e.g., "cpu", "cuda", "stack", "memory"."shape" + profiler_discrete: bool = False + profiler_tool: Optional[str] = None + profiler_steps: Optional[List[int]] = field(default_factory=list) # Steps to profile + @staticmethod def _patch_liger_kernel(): # fix logits_to_keep diff --git a/swift/utils/profiler/__init__.py b/swift/utils/profiler/__init__.py new file mode 100644 index 0000000000..be059d404c --- /dev/null +++ b/swift/utils/profiler/__init__.py @@ -0,0 +1,7 @@ +from .profile import DistProfiler, DistProfilerExtension, ProfilerConfig + +__all__ = [ + 'DistProfiler', + 'DistProfilerExtension', + 'ProfilerConfig', +] diff --git a/swift/utils/profiler/config.py b/swift/utils/profiler/config.py new file mode 100644 index 0000000000..7ba3dcf856 --- /dev/null +++ b/swift/utils/profiler/config.py @@ -0,0 +1,142 @@ +import collections +from dataclasses import FrozenInstanceError, dataclass, field, fields +from typing import Any, Optional + +# BaseConfig class inherits from collections.abc.Mapping, which means it can act like a dictionary + + +@dataclass +class BaseConfig(collections.abc.Mapping): + """The BaseConfig provides dict-like interface for a dataclass config. + + By default all fields in the config is not mutable, unless specified in + "_mutable_fields". The BaseConfig class implements the Mapping Abstract Base Class. + This allows instances of this class to be used like dictionaries. + """ + + _mutable_fields = set() + _target_: str = '' + + def __setattr__(self, name: str, value): + """Set the value of an attribute. Check if the attr is mutable before setting the value.""" + # If the field already exists, it's considered frozen unless it's in _mutable_fields + if name in self.__dict__ and name not in getattr(self, '_mutable_fields', set()): + raise FrozenInstanceError(f"Field '{name}' is frozen and cannot be modified") + super().__setattr__(name, value) + + def get(self, key: str, default: Any = None) -> Any: + """Get the value associated with the given key. If the key does not exist, return the default value. + + Args: + key (str): The attribute name to retrieve. + default (Any, optional): The value to return if the attribute does not exist. Defaults to None. + + Returns: + Any: The value of the attribute or the default value. + """ + try: + return getattr(self, key) + except AttributeError: + return default + + def __getitem__(self, key: str): + """Implement the [] operator for the class. Allows accessing attributes like dictionary items. + + Args: + key (str): The attribute name to retrieve. + + Returns: + Any: The value of the attribute. + + Raises: + AttributeError: If the attribute does not exist. + TypeError: If the key type is not string + """ + return getattr(self, key) + + def __iter__(self): + """Implement the iterator protocol. Allows iterating over the attribute names of the instance. + + Yields: + str: The name of each field in the dataclass. + """ + for f in fields(self): + yield f.name + + def __len__(self): + """ + Return the number of fields in the dataclass. + + Returns: + int: The number of fields in the dataclass. + """ + return len(fields(self)) + + +@dataclass +class ProfilerConfig(BaseConfig): + """Worker profiler config. + + Args: + discrete (bool): True for each task has its own database, False for all tasks in one training step + share one database. + all_ranks (bool): Whether to profile all ranks. + ranks (list[int]): The ranks that will be profiled. Defaults to []. + global_tool_config (Any): Global tool configuration for all profiling tools. + """ + + tool: Optional[str] = None + enable: bool = False + all_ranks: bool = False + ranks: list[int] = field(default_factory=list) + save_path: Optional[str] = None + tool_config: Any = None + global_tool_config: Optional[Any] = None # Global tool configuration for all profiling tools + + def union(self, other: 'ProfilerConfig') -> 'ProfilerConfig': + assert self.tool == other.tool, f"Cannot union ProfilerConfig with different tools: {self.tool} vs {other.tool}" + return ProfilerConfig( + tool=self.tool, + enable=self.enable or other.enable, + all_ranks=self.all_ranks or other.all_ranks, + ranks=list(set(self.ranks or []) | set(other.ranks or [])), + save_path=self.save_path, + tool_config=self.tool_config, + global_tool_config=self.global_tool_config or other.global_tool_config, + ) + + def intersect(self, other: 'ProfilerConfig') -> 'ProfilerConfig': + assert self.tool == other.tool, ( + f"Cannot intersect ProfilerConfig with different tools: {self.tool} vs {other.tool}") + return ProfilerConfig( + tool=self.tool, + enable=self.enable and other.enable, + all_ranks=self.all_ranks and other.all_ranks, + ranks=list(set(self.ranks or []) & set(other.ranks or [])), + save_path=self.save_path, + tool_config=self.tool_config, + global_tool_config=self.global_tool_config if self.global_tool_config else other.global_tool_config, + ) + + def __post_init__(self) -> None: + """config validation logics go here""" + assert isinstance(self.ranks, + set | list | tuple), (f"Profiler ranks must be of type list, got {type(self.ranks)}") + + +@dataclass +class TorchProfilerToolConfig(BaseConfig): + """Torch profiler tool config.""" + + # options: cuda, cpu, memory, shapes, stack + contents: list[str] = field(default_factory=list) + discrete: bool = False + name: str = 'torch' + + def __post_init__(self) -> None: + """config validation logics go here""" + __support_contents = ['cuda', 'cpu', 'memory', 'shapes', 'stack'] + for content in self.contents: + assert content in __support_contents, ( + f"Profiler contents only supports {__support_contents}, but gets {content}") + assert isinstance(self.contents, list), f"Profiler contents must be of type list, got {type(self.contents)}" diff --git a/swift/utils/profiler/profile.py b/swift/utils/profiler/profile.py new file mode 100644 index 0000000000..764e9ef2fb --- /dev/null +++ b/swift/utils/profiler/profile.py @@ -0,0 +1,146 @@ +import functools +import os +import torch +from typing import Callable, Optional + +from .config import ProfilerConfig, TorchProfilerToolConfig + + +class DistProfiler: + + def __init__(self, + global_config=None, + rank: int = None, + config: Optional[ProfilerConfig] = None, + tool_config: Optional[object] = None, + **kwargs): + # Default config + if rank is None: + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = int(os.environ.get('RANK', 0)) + print(f"Warning: torch.distributed is not initialized, using RANK env var for rank: {rank}") + if global_config is not None: + config = ProfilerConfig( + tool=global_config.profiler_tool, + enable=global_config.enable_profiler, + all_ranks=global_config.profiler_all_ranks, + ranks=global_config.profiler_ranks, + save_path=global_config.profiler_save_path, + tool_config=tool_config or TorchProfilerToolConfig( + contents=global_config.profiler_contents, discrete=global_config.profiler_discrete), + ) + elif not config: + config = ProfilerConfig(ranks=[], enable=False, tool_config=None) + + if tool_config is None: + tool_config = config.tool_config + + self.config = config + self.tool_config = tool_config + + self._impl = None + self._tool = getattr(config, 'tool', None) + self._enable = config.enable + self._this_step = False + + # Normalize rank selection + self._this_rank = False + if config.all_ranks: + self._this_rank = True + elif config.ranks: + self._this_rank = rank in config.ranks + else: + # default rank 0 if enabled but ranks unspecified + self._this_rank = (rank == 0) if self._enable else False + + self._discrete = getattr(tool_config, 'discrete', False) if tool_config else False + + if self._tool == 'torch': + from .torch_profile import Profiler as _Torch + + self._impl = _Torch(rank=rank, config=config, tool_config=tool_config) + else: + # Fallback to a no-op impl + self._impl = _NoOpProfiler() + + def check_enable(self): + return self._enable + + def check_this_rank(self): + return self._this_rank + + def check_this_step(self): + return self._this_step + + def is_discrete_mode(self): + return self._discrete + + def start(self, **kwargs): + if self.check_enable() and self.check_this_rank(): + self._this_step = True + return getattr(self._impl, 'start', lambda **_: None)(**kwargs) + + def stop(self): + if self.check_enable() and self.check_this_rank(): + self._this_step = False + return getattr(self._impl, 'stop', lambda: None)() + + @classmethod + def annotate( + cls, + message: Optional[str] = None, + color: Optional[str] = None, + domain: Optional[str] = None, + category: Optional[str] = None, + **kwargs_outer, + ) -> Callable: + + def decorator(func): + + @functools.wraps(func) + def wrapper(self_instance, *args, **kwargs_inner): + profiler = getattr(self_instance, 'profiler', None) + + if (not profiler or not profiler.check_enable() or not profiler.check_this_step() + or not profiler.check_this_rank()): + return func(self_instance, *args, **kwargs_inner) + + impl = profiler._impl + if hasattr(impl, 'annotate'): + try: + actual_decorator = impl.annotate( + message=message, color=color, domain=domain, category=category, **kwargs_outer) + + return actual_decorator(func)(self_instance, *args, **kwargs_inner) + except Exception: + return func(self_instance, *args, **kwargs_inner) + return func(self_instance, *args, **kwargs_inner) + + return wrapper + + return decorator + + +class DistProfilerExtension: + + def __init__(self, profiler: DistProfiler): + self.profiler = profiler + + def start_profile(self, **kwargs) -> None: + """Start profiling for the current rank in the current training step.""" + self.profiler.start(**kwargs) + + def stop_profile(self) -> None: + """Stop profiling for the current rank in the current training step.""" + self.profiler.stop() + + +class _NoOpProfiler: + + def start(self, **kwargs): + return + + def stop(self): + return diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py new file mode 100644 index 0000000000..663234f8d9 --- /dev/null +++ b/swift/utils/profiler/torch_profile.py @@ -0,0 +1,147 @@ +import functools +import os +import torch +from datetime import datetime, timezone +from typing import Callable, Optional + +from .config import ProfilerConfig, TorchProfilerToolConfig +from .profile import DistProfiler + + +def get_torch_profiler( + contents: list[str], + save_path: str, + role: Optional[str] = None, + save_file_prefix: Optional[str] = None, + rank: int = 0, +): + if role: + save_path = os.path.join(save_path, role) + + os.makedirs(save_path, exist_ok=True) + + current_time = datetime.now(tz=timezone.utc).astimezone() + timestamp = current_time.strftime('%Y%m%d%H%M%S%f')[:-3] + pid = os.getpid() + + save_file_name = f"prof_rank-{rank}_{pid}_{timestamp}.json.gz" + if save_file_prefix: + save_file_name = f"{save_file_prefix}_{save_file_name}" + save_path = os.path.join(save_path, save_file_name) + + def _trace_handler(prof): + print(f"[Profiler] Saving trace to {save_path}") + prof.export_chrome_trace(save_path) + + contents = set(contents) if contents else set() + activities = [] + if not contents or 'cpu' in contents: + activities.append(torch.profiler.ProfilerActivity.CPU) + if not contents or 'cuda' in contents: + activities.append(torch.profiler.ProfilerActivity.CUDA) + + return torch.profiler.profile( + activities=activities, + with_stack='stack' in contents, + record_shapes='shapes' in contents, + profile_memory='memory' in contents, + on_trace_ready=_trace_handler, + ) + + +class Profiler(DistProfiler): + + _define_count = 0 + + def __init__( + self, + rank, + config: ProfilerConfig, + tool_config: Optional[TorchProfilerToolConfig] = None, + save_file_prefix=None, + ): + # note : if we do not set use_profile, it will be set as None, so that all function will be skip + config = config or ProfilerConfig(ranks=[], enable=False) + self.save_file_prefix = save_file_prefix + + if not tool_config: + assert not config.enable, 'tool_config must be provided when profiler is enabled' + + self.prof = None + self.rank = rank + self.config = config + self.tool_config = tool_config + self.contents = self.tool_config.contents + self.save_path = self.config.save_path + # Align with other profilers: read discrete mode, default to False for torch profiler + self.discrete = getattr(self.tool_config, 'discrete', False) + + def check(self): + return self.prof is not None + + def start(self, **kwargs): + role = kwargs.get('role', None) + if not self.discrete and Profiler._define_count == 0: + self.prof = get_torch_profiler( + contents=self.contents, + save_path=self.save_path, + role=role, + save_file_prefix=self.save_file_prefix, + rank=self.rank, + ) + print(f"[Profiler] started for rank {self.rank}") + self.prof.start() + Profiler._define_count += 1 + + def step(self): + if self.check(): + self.prof.step() + + def stop(self): + if not self.discrete and Profiler._define_count == 1: + self.step() + print(f"[Profiler] stopped for rank {self.rank}") + self.prof.stop() + Profiler._define_count -= 1 + + def annotate(self, message: Optional[str] = None, role: Optional[str] = None, **kwargs_outer) -> Callable: + """Decorate a Worker member function to profile the current rank in the current training step. + + Requires the target function to be a member function of a Worker, + which has a member field `profiler` with Profiler type. + + Args: + message (str, optional): + The message to be displayed in the profiler. Defaults to None. + role (str, optional): + The role of the current data collection. Defaults to None. + """ + + def decorator(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs_inner): + profile_name = message or func.__name__ + + if not self.discrete: + # In continuous mode, we just record function, profiler started globally + with torch.profiler.record_function(profile_name): + return func(*args, **kwargs_inner) + + # In discrete mode, we start/stop profiler around the function + prof = get_torch_profiler( + contents=self.contents, + save_path=self.save_path, + role=role, + save_file_prefix=self.save_file_prefix, + rank=self.rank, + ) + prof.start() + with torch.profiler.record_function(profile_name): + result = func(*args, **kwargs_inner) + prof.stop() + return result + + return wrapper + + return decorator From 8c5aab60dd9774c8954536e7917a3157cb8b37ba Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:08:46 +0800 Subject: [PATCH 02/17] Update swift/utils/profiler/config.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/config.py b/swift/utils/profiler/config.py index 7ba3dcf856..ba774ee314 100644 --- a/swift/utils/profiler/config.py +++ b/swift/utils/profiler/config.py @@ -121,7 +121,7 @@ def intersect(self, other: 'ProfilerConfig') -> 'ProfilerConfig': def __post_init__(self) -> None: """config validation logics go here""" assert isinstance(self.ranks, - set | list | tuple), (f"Profiler ranks must be of type list, got {type(self.ranks)}") + (set, list, tuple)), (f"Profiler ranks must be of type list, got {type(self.ranks)}") @dataclass From 740d94251de693b0a7b308ca0da9ed37cc566763 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:13:53 +0800 Subject: [PATCH 03/17] Update swift/utils/profiler/config.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swift/utils/profiler/config.py b/swift/utils/profiler/config.py index ba774ee314..6e3cb45c4a 100644 --- a/swift/utils/profiler/config.py +++ b/swift/utils/profiler/config.py @@ -100,8 +100,8 @@ def union(self, other: 'ProfilerConfig') -> 'ProfilerConfig': enable=self.enable or other.enable, all_ranks=self.all_ranks or other.all_ranks, ranks=list(set(self.ranks or []) | set(other.ranks or [])), - save_path=self.save_path, - tool_config=self.tool_config, + save_path=self.save_path or other.save_path, + tool_config=self.tool_config or other.tool_config, global_tool_config=self.global_tool_config or other.global_tool_config, ) From 99f464c58896cd08320e2e422805db2f656f6f63 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:14:18 +0800 Subject: [PATCH 04/17] Update swift/utils/profiler/torch_profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/torch_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index 663234f8d9..687075ff50 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -30,7 +30,7 @@ def get_torch_profiler( save_path = os.path.join(save_path, save_file_name) def _trace_handler(prof): - print(f"[Profiler] Saving trace to {save_path}") + logger.info(f"[Profiler] Saving trace to {save_path}") prof.export_chrome_trace(save_path) contents = set(contents) if contents else set() From 23e93994ff0d7e9717ab46c24d457476ae800176 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:14:31 +0800 Subject: [PATCH 05/17] Update swift/utils/profiler/torch_profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/torch_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index 687075ff50..00182e0159 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -100,7 +100,7 @@ def step(self): def stop(self): if not self.discrete and Profiler._define_count == 1: self.step() - print(f"[Profiler] stopped for rank {self.rank}") + logger.info(f"[Profiler] stopped for rank {self.rank}") self.prof.stop() Profiler._define_count -= 1 From 717d5f27ac170df2a0e39c8ff12a28dfdb200f82 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:14:45 +0800 Subject: [PATCH 06/17] Update swift/trainers/arguments.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/trainers/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index 768879d33b..ab1a5b863d 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -219,7 +219,7 @@ class TrainArgumentsMixin: profiler_ranks: List[int] = field(default_factory=list) profiler_contents: List[str] = field(default_factory=list) # e.g., "cpu", "cuda", "stack", "memory"."shape" profiler_discrete: bool = False - profiler_tool: Optional[str] = None + profiler_tool: Optional[str] = 'torch' profiler_steps: Optional[List[int]] = field(default_factory=list) # Steps to profile @staticmethod From 6bb140bbdb1cb9b80089bc1d25aa63da139c265b Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:15:08 +0800 Subject: [PATCH 07/17] Update swift/utils/profiler/torch_profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/torch_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index 00182e0159..0f30120686 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -89,7 +89,7 @@ def start(self, **kwargs): save_file_prefix=self.save_file_prefix, rank=self.rank, ) - print(f"[Profiler] started for rank {self.rank}") + logger.info(f"[Profiler] started for rank {self.rank}") self.prof.start() Profiler._define_count += 1 From 269be95f9cac4356c75d838b60692aab39f64140 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:15:35 +0800 Subject: [PATCH 08/17] Update swift/utils/profiler/profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/profile.py b/swift/utils/profiler/profile.py index 764e9ef2fb..baf2d968b5 100644 --- a/swift/utils/profiler/profile.py +++ b/swift/utils/profiler/profile.py @@ -20,7 +20,7 @@ def __init__(self, rank = torch.distributed.get_rank() else: rank = int(os.environ.get('RANK', 0)) - print(f"Warning: torch.distributed is not initialized, using RANK env var for rank: {rank}") + logger.warning(f"Warning: torch.distributed is not initialized, using RANK env var for rank: {rank}") if global_config is not None: config = ProfilerConfig( tool=global_config.profiler_tool, From 32a69d19d80c1876b54422c2d0997f2fb51c18e2 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:19:26 +0800 Subject: [PATCH 09/17] Update swift/utils/profiler/torch_profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/torch_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index 0f30120686..b2c5c3c9de 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -71,7 +71,7 @@ def __init__( self.rank = rank self.config = config self.tool_config = tool_config - self.contents = self.tool_config.contents + self.contents = self.tool_config.contents if self.tool_config else [] self.save_path = self.config.save_path # Align with other profilers: read discrete mode, default to False for torch profiler self.discrete = getattr(self.tool_config, 'discrete', False) From bf99454b01dff0b7c8f1b6eeec05e4590b38acdb Mon Sep 17 00:00:00 2001 From: "1243196045@qq" <1243196045@qq.com> Date: Fri, 29 May 2026 21:24:58 +0800 Subject: [PATCH 10/17] add swift logger --- swift/utils/profiler/torch_profile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index b2c5c3c9de..fb76d9d735 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -3,10 +3,11 @@ import torch from datetime import datetime, timezone from typing import Callable, Optional - +from swift.utils import get_logger from .config import ProfilerConfig, TorchProfilerToolConfig from .profile import DistProfiler +logger = get_logger() def get_torch_profiler( contents: list[str], From 0498984a343f9ce320687dbf76c3ad67b00a24f2 Mon Sep 17 00:00:00 2001 From: "1243196045@qq" <1243196045@qq.com> Date: Fri, 29 May 2026 21:31:36 +0800 Subject: [PATCH 11/17] updtate Profile --- swift/arguments/base_args/profile_args.py | 22 ++++++++++++---------- swift/utils/profiler/torch_profile.py | 2 ++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/swift/arguments/base_args/profile_args.py b/swift/arguments/base_args/profile_args.py index 10b51e448f..4ed45cf3e6 100644 --- a/swift/arguments/base_args/profile_args.py +++ b/swift/arguments/base_args/profile_args.py @@ -22,10 +22,18 @@ class ProfilerArguments: def __post_init__(self): assert not self.profiler_discrete, \ 'Profiler discrete mode is not supported yet, please set profiler_discrete to false' - if self.enable_profiler and 'profiler' not in self.callbacks: - self.callbacks.append('profiler') - if 'profiler' in self.callbacks and not self.enable_profiler: - self.enable_profiler = True + + if hasattr(self, 'callbacks'): + if self.enable_profiler and 'profiler' not in self.callbacks: + self.callbacks.append('profiler') + if 'profiler' in self.callbacks and not self.enable_profiler: + self.enable_profiler = True + if self.enable_profiler: + assert 'profiler' in self.callbacks, \ + 'Profiler callback must be included in callbacks when profiler is enabled.' + if 'profiler' in self.callbacks: + assert self.enable_profiler, \ + 'Profiler callback is included in callbacks but profiler is not enabled.' if self.enable_profiler: assert self.profiler_save_path is not None, \ 'Profiler save path must be specified when profiler is enabled.' @@ -35,12 +43,6 @@ def __post_init__(self): 'Profiler steps must be specified when profiler is enabled.' assert self.profiler_ranks != [] or self.profiler_all_ranks, \ 'Either profiler_ranks must be specified or profiler_all_ranks must be set to True.' - if self.enable_profiler: - assert 'profiler' in self.callbacks, \ - 'Profiler callback must be included in callbacks when profiler is enabled.' - if 'profiler' in self.callbacks: - assert self.enable_profiler, \ - 'Profiler callback is included in callbacks but profiler is not enabled.' def get_profiler_kwargs(self): return { diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index fb76d9d735..1a337af762 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -3,12 +3,14 @@ import torch from datetime import datetime, timezone from typing import Callable, Optional + from swift.utils import get_logger from .config import ProfilerConfig, TorchProfilerToolConfig from .profile import DistProfiler logger = get_logger() + def get_torch_profiler( contents: list[str], save_path: str, From 407ea6b0171e2d8533ac923f4d57bce58f722fbc Mon Sep 17 00:00:00 2001 From: "1243196045@qq" <1243196045@qq.com> Date: Fri, 29 May 2026 21:33:48 +0800 Subject: [PATCH 12/17] update Profile --- swift/arguments/base_args/profile_args.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/swift/arguments/base_args/profile_args.py b/swift/arguments/base_args/profile_args.py index 4ed45cf3e6..e77e457f92 100644 --- a/swift/arguments/base_args/profile_args.py +++ b/swift/arguments/base_args/profile_args.py @@ -34,6 +34,9 @@ def __post_init__(self): if 'profiler' in self.callbacks: assert self.enable_profiler, \ 'Profiler callback is included in callbacks but profiler is not enabled.' + else: + assert not self.enable_profiler, \ + 'Profiler cannot be enabled without callbacks attribute or with profiler callback missing in callbacks.' if self.enable_profiler: assert self.profiler_save_path is not None, \ 'Profiler save path must be specified when profiler is enabled.' From 452d04134f51afc8c47a6035aa9f15ed490ce9ec Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Fri, 29 May 2026 21:59:28 +0800 Subject: [PATCH 13/17] Update swift/utils/profiler/config.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/config.py b/swift/utils/profiler/config.py index 6e3cb45c4a..9750fc1ea4 100644 --- a/swift/utils/profiler/config.py +++ b/swift/utils/profiler/config.py @@ -135,8 +135,8 @@ class TorchProfilerToolConfig(BaseConfig): def __post_init__(self) -> None: """config validation logics go here""" + assert isinstance(self.contents, list), f"Profiler contents must be of type list, got {type(self.contents)}" __support_contents = ['cuda', 'cpu', 'memory', 'shapes', 'stack'] for content in self.contents: assert content in __support_contents, ( f"Profiler contents only supports {__support_contents}, but gets {content}") - assert isinstance(self.contents, list), f"Profiler contents must be of type list, got {type(self.contents)}" From 1f9069d97b50b55aa379b97b5344bc84546b3dd3 Mon Sep 17 00:00:00 2001 From: "1243196045@qq" <1243196045@qq.com> Date: Fri, 29 May 2026 22:03:31 +0800 Subject: [PATCH 14/17] update --- swift/utils/profiler/profile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swift/utils/profiler/profile.py b/swift/utils/profiler/profile.py index baf2d968b5..0bfb768de7 100644 --- a/swift/utils/profiler/profile.py +++ b/swift/utils/profiler/profile.py @@ -3,8 +3,9 @@ import torch from typing import Callable, Optional +from swift.utils import get_logger from .config import ProfilerConfig, TorchProfilerToolConfig - +logger = get_logger() class DistProfiler: From da28c9f1cbb61ae68eabc49e25448654d213010c Mon Sep 17 00:00:00 2001 From: "1243196045@qq" <1243196045@qq.com> Date: Fri, 29 May 2026 22:06:05 +0800 Subject: [PATCH 15/17] update --- swift/utils/profiler/profile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/utils/profiler/profile.py b/swift/utils/profiler/profile.py index 0bfb768de7..a3be0c5e0b 100644 --- a/swift/utils/profiler/profile.py +++ b/swift/utils/profiler/profile.py @@ -5,8 +5,10 @@ from swift.utils import get_logger from .config import ProfilerConfig, TorchProfilerToolConfig + logger = get_logger() + class DistProfiler: def __init__(self, From f79da7c42109a4168708aaf5d8c8bb56abd6187b Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:05:03 +0800 Subject: [PATCH 16/17] Update swift/utils/profiler/torch_profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/torch_profile.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index 1a337af762..ff8f62e08d 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -140,9 +140,11 @@ def wrapper(*args, **kwargs_inner): rank=self.rank, ) prof.start() - with torch.profiler.record_function(profile_name): - result = func(*args, **kwargs_inner) - prof.stop() + try: + with torch.profiler.record_function(profile_name): + result = func(*args, **kwargs_inner) + finally: + prof.stop() return result return wrapper From 6cffadbb0f72b7bb48225289896bac3902420f42 Mon Sep 17 00:00:00 2001 From: Amis <62925330+qq1243196045@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:11:57 +0800 Subject: [PATCH 17/17] Update swift/utils/profiler/torch_profile.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/utils/profiler/torch_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/utils/profiler/torch_profile.py b/swift/utils/profiler/torch_profile.py index ff8f62e08d..0d0f74c408 100644 --- a/swift/utils/profiler/torch_profile.py +++ b/swift/utils/profiler/torch_profile.py @@ -101,7 +101,7 @@ def step(self): self.prof.step() def stop(self): - if not self.discrete and Profiler._define_count == 1: + if not self.discrete and Profiler._define_count == 1 and self.check(): self.step() logger.info(f"[Profiler] stopped for rank {self.rank}") self.prof.stop()