From ff3f0be708da7c924ca4b742a9b418c09c06ea41 Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Fri, 10 Oct 2025 02:48:57 +0000 Subject: [PATCH 1/2] feat:clone and convert tensors on the original device --- padiff/utils/utils.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/padiff/utils/utils.py b/padiff/utils/utils.py index fad452f..abde6d6 100644 --- a/padiff/utils/utils.py +++ b/padiff/utils/utils.py @@ -24,6 +24,9 @@ from .log import logger +_bf16_warning_shown = False + + def set_seed(seed=42): np.random.seed(seed) paddle.seed(seed) @@ -33,12 +36,28 @@ def set_seed(seed=42): def get_numpy_from_tensor(tensor): + global _bf16_warning_shown + + bf16_warning = False if tensor.dtype == torch.bfloat16: - np_array = tensor.cpu().detach().float().numpy() + tensor = tensor.to(torch.float32) + bf16_warning = not _bf16_warning_shown elif tensor.dtype == paddle.bfloat16: - np_array = tensor.cpu().detach().astype("float32").numpy() + tensor = tensor.astype("float32") + bf16_warning = not _bf16_warning_shown - else: - np_array = tensor.cpu().detach().numpy() + + if isinstance(tensor, torch.Tensor): + tensor = tensor.detach().cpu() + + np_array = tensor.numpy() + if bf16_warning: + logger.warning( + "Precision Warning: The model contains 'bfloat16' tensors. " + "Due to the inherent lower precision of bfloat16 and potential differences in conversion " + "between PyTorch and PaddlePaddle, numerical comparisons may show larger-than-expected differences. " + "Consider using 'float32' for critical alignment checks or adjusting 'atol'/'rtol' accordingly." 
+ ) + _bf16_warning_shown = True return np_array @@ -73,14 +92,9 @@ def set_require_grad(x): x.stop_gradient = False -def _clone_tensor(inp): # to cpu +def _clone_tensor(inp): # no device changes if isinstance(inp, (torch.Tensor, paddle.Tensor)): - if inp.numel() == 0: - if isinstance(inp, torch.Tensor): - return torch.tensor([], dtype=inp.dtype) - else: - return paddle.to_tensor([], dtype=inp.dtype) - new_t = inp.detach().cpu().clone() + new_t = inp.detach().clone() if is_require_grad(inp): set_require_grad(new_t) return new_t From 2bcb880934cb93793ff6fc0255a1ac49c7229779 Mon Sep 17 00:00:00 2001 From: lijialin03 Date: Wed, 15 Oct 2025 07:20:47 +0000 Subject: [PATCH 2/2] feat:add warning_once and debug_once --- docs/config_example.yaml | 7 +-- padiff/abstracts/hooks/guard.py | 1 - padiff/abstracts/hooks/hook.py | 77 ++++++++++++++------------------- padiff/comparison/actions.py | 8 ++-- padiff/utils/log.py | 9 ++++ padiff/utils/utils.py | 25 ++++++----- 6 files changed, 64 insertions(+), 63 deletions(-) diff --git a/docs/config_example.yaml b/docs/config_example.yaml index c5ef01a..838002d 100644 --- a/docs/config_example.yaml +++ b/docs/config_example.yaml @@ -8,7 +8,7 @@ CLI: pd_model_name: "pd_model" # PaddlePaddle 模型变量名 pt_optim_name: "pt_optimizer" # (可选) PyTorch 优化器变量名 pd_optim_name: "pd_optimizer" # (可选) PaddlePaddle 优化器变量名 - base_framework: "torch" # (可选) 设置作为 base 的框架 + base_framework: "torch" # (可选) 设置作为基准的框架名 log_dir: "./padiff_log" # (可选) 日志目录 # --- COMPARE 部分 --- @@ -16,8 +16,9 @@ CLI: COMPARE: atol: 1.0e-06 # (可选) 绝对误差 rtol: 1.0e-06 # (可选) 相对误差 - compare_mode: "mean" # (可选) 对比模式 - action_name: "equal" # (可选) 对比动作 + compare_mode: "mean" # (可选) 数值对比模式 + action_name: "equal" # (可选) 层对比策略 + check_mode: "fast" # (可选) 模型对比策略 # --- PaDiffGuard 部分 --- # 定义模型对齐行为 diff --git a/padiff/abstracts/hooks/guard.py b/padiff/abstracts/hooks/guard.py index bca357d..c6f9e5e 100644 --- a/padiff/abstracts/hooks/guard.py +++ b/padiff/abstracts/hooks/guard.py @@ -112,7 
+112,6 @@ def SingleStepGuard(diff_phase, base_dump_path): def AlignmentGuard(model, seed=42): """Prepare the model environment for accuracy alignment.""" logger.debug(f"AlignmentGuard: Initializing for {model}") - model.model.train() model.toggle_dropout(enable=False) set_seed(seed) try: diff --git a/padiff/abstracts/hooks/hook.py b/padiff/abstracts/hooks/hook.py index 5b5167c..3291108 100644 --- a/padiff/abstracts/hooks/hook.py +++ b/padiff/abstracts/hooks/hook.py @@ -32,9 +32,6 @@ from .base import current_report, find_base_report_node, single_step_state -_seen_warnings = set() - - @contextlib.contextmanager def register_hooker(model): marker = model.marker @@ -274,22 +271,20 @@ def inner(input_): if cur_idx >= len(numpy_file_list): warning_key = ("single-step: output_count_mismatch", current_name) - if warning_key not in _seen_warnings: - logger.warning( - f"\n ⚠️ Single-step alignment SKIPPED: the {cur_idx + 1}st output is requested, " - f"but only {len(numpy_file_list)} pre-saved from base model, skip the current output." - "\n ⚠️ This warning will not repeat for this layer." - f"\n 📌 Layer Name: {current_name}(raw)" - "\n 💡 Possible Causes and Solutions:" - "\n - The number of outputs from the current layer in the raw model is bigger than" - "that of its corresponding layer in the base model." - "\n - Verify that both models have identical architectures for this layer." - "\n - If the corresponding relationship of the current layer is correct, " - "please disable single step mode, or add the layer to blacklist to skip the check of this layer." - "\n - Or when you are sure that the extra output does not need to be compared, " - "you can swap the execution order of the base model and the raw model." - ) - _seen_warnings.add(warning_key) + logger.warning_once( + f"\n ⚠️ Single-step alignment SKIPPED: the {cur_idx + 1}st output is requested, " + f"but only {len(numpy_file_list)} pre-saved from base model, skip the current output." 
+ "\n ⚠️ This warning will not repeat for this layer." + f"\n 📌 Layer Name: {current_name}(raw)" + "\n 💡 Possible Causes and Solutions:" + "\n - The number of outputs from the current layer in the raw model is bigger than" + "that of its corresponding layer in the base model." + "\n - Verify that both models have identical architectures for this layer." + "\n - If the corresponding relationship of the current layer is correct, " + "please disable single step mode, or add the layer to blacklist to skip the check of this layer." + "\n - Or when you are sure that the extra output does not need to be compared, " + "you can swap the execution order of the base model and the raw model." + ) return input_ value = np.load(numpy_file_list[cur_idx]["path"]) @@ -302,25 +297,21 @@ def inner(input_): elif np.prod(base_shape) != np.prod(raw_shape): warning_key = ("single-step: shape_mismatch", current_name) - if warning_key not in _seen_warnings: - logger.warning( - f"\n ⚠️ Single-step alignment SKIPPED: shape mismatch." - "\n ⚠️ This warning will not repeat for this layer." - f"\n 📌 Layer Name: {current_name}(raw)" - f"\n 📌 Shape: {base_shape}(base) vs {raw_shape}(raw)" - ) - _seen_warnings.add(warning_key) + logger.warning_once( + f"\n ⚠️ Single-step alignment SKIPPED: shape mismatch." + "\n ⚠️ This warning will not repeat for this layer." + f"\n 📌 Layer Name: {current_name}(raw)" + f"\n 📌 Shape: {base_shape}(base) vs {raw_shape}(raw)" + ) return input_ else: value = value.reshape(input_.shape) debug_key = ("single-step: reshape_used", current_name) - if debug_key not in _seen_warnings: - logger.debug( - f"\n ⚠️ Try to reshape loaded value to input's shape of layer {current_name}(raw). " - "This may lead to numerical errors even if reshape succeeds." - "\n ⚠️ This warning will not repeat for this layer." - ) - _seen_warnings.add(debug_key) + logger.debug_once( + f"\n ⚠️ Try to reshape loaded value to input's shape of layer {current_name}(raw). 
" + "This may lead to numerical errors even if reshape succeeds." + "\n ⚠️ This warning will not repeat for this layer." + ) if isinstance(input_, paddle.Tensor): return paddle.to_tensor(value, dtype=input_.dtype) @@ -336,16 +327,14 @@ def single_step_check(report, net_id, step_idx, current_name, node_type, bwd_ite base_report_node = find_base_report_node(net_id, step_idx) if base_report_node["name"] != current_name: warning_key = ("single-step: name_mismatch", current_name) - if warning_key not in _seen_warnings: - logger.warning( - f"\n ⚠️ Single-step alignment WARNING: {node_type} with net_id={net_id} mismatch!" - "\n ⚠️ This warning will not repeat for this layer." - f"\n 📌 Mismatch {node_type.capitalize()}: {base_report_node['name']}(base) vs {current_name}(raw)" - "\n 💡 Suggestion: Models have different architectures or class name or initialization order. " - "Please check the model implementation or decrease 'align_depth' to reduce the alignment " - "granularity, or add layers that do not require alignment to the blacklist." - ) - _seen_warnings.add(warning_key) + logger.warning_once( + f"\n ⚠️ Single-step alignment WARNING: {node_type} with net_id={net_id} mismatch!" + "\n ⚠️ This warning will not repeat for this layer." + f"\n 📌 Mismatch {node_type.capitalize()}: {base_report_node['name']}(base) vs {current_name}(raw)" + "\n 💡 Suggestion: Models have different architectures or class name or initialization order. " + "Please check the model implementation or decrease 'align_depth' to reduce the alignment " + "granularity, or add layers that do not require alignment to the blacklist." 
+ ) else: logger.debug(f"Single Step: {current_name}(net_id={net_id})") diff --git a/padiff/comparison/actions.py b/padiff/comparison/actions.py index 0840035..e9b8498 100644 --- a/padiff/comparison/actions.py +++ b/padiff/comparison/actions.py @@ -110,7 +110,7 @@ def priority(self): def __call__(self, file_list_0, file_list_1, cfg): len_fl_0, len_fl_1 = len(file_list_0), len(file_list_1) if len_fl_0 != len_fl_1: - logger.warning(f"number of tensors for compare is not equal, {len_fl_0} vs {len_fl_1}") + logger.warning_once(f"number of tensors for compare is not equal, {len_fl_0} vs {len_fl_1}") min_len = min(len_fl_0, len_fl_1) @@ -123,7 +123,7 @@ def __call__(self, file_list_0, file_list_1, cfg): tensor_1 = np.transpose(tensor_1) if tensor_0.size == 0 or tensor_1.size == 0: - logger.debug("Found empty tensor, compare skipped!") + logger.debug_once("Found empty tensor, compare skipped!") continue if tensor_0.shape != tensor_1.shape: @@ -140,10 +140,10 @@ def __call__(self, file_list_0, file_list_1, cfg): tensor_1 = np.reshape(tensor_1, tensor_0.shape) else: debug_msg += "however tensors cannot be converted to each other, skip!" 
- logger.debug(debug_msg) + logger.debug_once(debug_msg) continue - logger.debug(debug_msg) + logger.debug_once(debug_msg) assert_tensor_equal(tensor_0, tensor_1, cfg) num_success += 1 diff --git a/padiff/utils/log.py b/padiff/utils/log.py index e469559..2d0049b 100644 --- a/padiff/utils/log.py +++ b/padiff/utils/log.py @@ -16,6 +16,7 @@ import shutil import logging import colorlog +import functools log_config = { @@ -87,6 +88,10 @@ def warning(self, *args): else: print(f"[AutoDiff] [WARNING] {' '.join(map(str, args))}") + @functools.lru_cache(maxsize=None) + def warning_once(self, *args): + self.warning(*args) + def error(self, *args): if self._logger is not None: self._logger.error(" ".join(map(str, args))) @@ -99,6 +104,10 @@ def debug(self, *args): else: print(f"[AutoDiff] [DEBUG] {' '.join(map(str, args))}") + @functools.lru_cache(maxsize=None) + def debug_once(self, *args): + self.debug(*args) + def reset_dir(self, path): if os.path.exists(path): shutil.rmtree(path) diff --git a/padiff/utils/utils.py b/padiff/utils/utils.py index abde6d6..1c5460e 100644 --- a/padiff/utils/utils.py +++ b/padiff/utils/utils.py @@ -14,6 +14,7 @@ import json import collections.abc +import hashlib import numpy as np import paddle @@ -24,9 +25,6 @@ from .log import logger -_bf16_warning_shown = False - - def set_seed(seed=42): np.random.seed(seed) paddle.seed(seed) @@ -36,28 +34,26 @@ def set_seed(seed=42): def get_numpy_from_tensor(tensor): - global _bf16_warning_shown + bf16_warning = True - bf16_warning = False if tensor.dtype == torch.bfloat16: tensor = tensor.to(torch.float32) - bf16_warning = not _bf16_warning_shown elif tensor.dtype == paddle.bfloat16: tensor = tensor.astype("float32") - bf16_warning = not _bf16_warning_shown + else: + bf16_warning = False - if isinstance(tensor, torch.Tensor): - tensor = tensor.detach().cpu() + tensor = tensor.detach().cpu() np_array = tensor.numpy() + if bf16_warning: - logger.warning( + logger.warning_once( "Precision Warning: The model 
contains 'bfloat16' tensors. " "Due to the inherent lower precision of bfloat16 and potential differences in conversion " "between PyTorch and PaddlePaddle, numerical comparisons may show larger-than-expected differences. " "Consider using 'float32' for critical alignment checks or adjusting 'atol'/'rtol' accordingly." ) - _bf16_warning_shown = True return np_array @@ -285,8 +281,15 @@ def assert_tensor_equal(tensor1, tensor2, cfg): np.testing.assert_allclose(tensor1.mean(), tensor2.mean(), atol=atol, rtol=rtol) elif compare_mode == "strict": np.testing.assert_allclose(tensor1, tensor2, atol=atol, rtol=rtol) + elif compare_mode == "abs_strict": + np.testing.assert_allclose(abs(tensor1), abs(tensor2), atol=atol, rtol=rtol) elif compare_mode == "abs_mean": np.testing.assert_allclose(abs(tensor1).mean(), abs(tensor2).mean(), atol=atol, rtol=rtol) + elif compare_mode == "md5": + md5_hash1 = hashlib.md5(tensor1.tobytes()).hexdigest() + md5_hash2 = hashlib.md5(tensor2.tobytes()).hexdigest() + if md5_hash1 != md5_hash2: + raise ValueError(f"MD5 diff: {md5_hash1} vs {md5_hash2}") else: raise RuntimeError(f"Invalid compare_mode {compare_mode}")