Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions docs/config_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@ CLI:
pd_model_name: "pd_model" # PaddlePaddle 模型变量名
pt_optim_name: "pt_optimizer" # (可选) PyTorch 优化器变量名
pd_optim_name: "pd_optimizer" # (可选) PaddlePaddle 优化器变量名
base_framework: "torch" # (可选) 设置作为 base 的框架
base_framework: "torch" # (可选) 设置作为基准的框架名
log_dir: "./padiff_log" # (可选) 日志目录

# --- COMPARE 部分 ---
# 定义结果对比逻辑
COMPARE:
atol: 1.0e-06 # (可选) 绝对误差
rtol: 1.0e-06 # (可选) 相对误差
compare_mode: "mean" # (可选) 对比模式
action_name: "equal" # (可选) 对比动作
compare_mode: "mean" # (可选) 数值对比模式
action_name: "equal" # (可选) 层对比策略
check_mode: "fast" # (可选) 模型对比策略

# --- PaDiffGuard 部分 ---
# 定义模型对齐行为
Expand Down
1 change: 0 additions & 1 deletion padiff/abstracts/hooks/guard.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def SingleStepGuard(diff_phase, base_dump_path):
def AlignmentGuard(model, seed=42):
"""Prepare the model environment for accuracy alignment."""
logger.debug(f"AlignmentGuard: Initializing for {model}")
model.model.train()
model.toggle_dropout(enable=False)
set_seed(seed)
try:
Expand Down
77 changes: 33 additions & 44 deletions padiff/abstracts/hooks/hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@
from .base import current_report, find_base_report_node, single_step_state


_seen_warnings = set()


@contextlib.contextmanager
def register_hooker(model):
marker = model.marker
Expand Down Expand Up @@ -274,22 +271,20 @@ def inner(input_):

if cur_idx >= len(numpy_file_list):
warning_key = ("single-step: output_count_mismatch", current_name)
if warning_key not in _seen_warnings:
logger.warning(
f"\n ⚠️ Single-step alignment SKIPPED: the {cur_idx + 1}st output is requested, "
f"but only {len(numpy_file_list)} pre-saved from base model, skip the current output."
"\n ⚠️ This warning will not repeat for this layer."
f"\n 📌 Layer Name: {current_name}(raw)"
"\n 💡 Possible Causes and Solutions:"
"\n - The number of outputs from the current layer in the raw model is bigger than"
"that of its corresponding layer in the base model."
"\n - Verify that both models have identical architectures for this layer."
"\n - If the corresponding relationship of the current layer is correct, "
"please disable single step mode, or add the layer to blacklist to skip the check of this layer."
"\n - Or when you are sure that the extra output does not need to be compared, "
"you can swap the execution order of the base model and the raw model."
)
_seen_warnings.add(warning_key)
logger.warning_once(
f"\n ⚠️ Single-step alignment SKIPPED: the {cur_idx + 1}st output is requested, "
f"but only {len(numpy_file_list)} pre-saved from base model, skip the current output."
"\n ⚠️ This warning will not repeat for this layer."
f"\n 📌 Layer Name: {current_name}(raw)"
"\n 💡 Possible Causes and Solutions:"
"\n - The number of outputs from the current layer in the raw model is bigger than"
"that of its corresponding layer in the base model."
"\n - Verify that both models have identical architectures for this layer."
"\n - If the corresponding relationship of the current layer is correct, "
"please disable single step mode, or add the layer to blacklist to skip the check of this layer."
"\n - Or when you are sure that the extra output does not need to be compared, "
"you can swap the execution order of the base model and the raw model."
)
return input_

value = np.load(numpy_file_list[cur_idx]["path"])
Expand All @@ -302,25 +297,21 @@ def inner(input_):

elif np.prod(base_shape) != np.prod(raw_shape):
warning_key = ("single-step: shape_mismatch", current_name)
if warning_key not in _seen_warnings:
logger.warning(
f"\n ⚠️ Single-step alignment SKIPPED: shape mismatch."
"\n ⚠️ This warning will not repeat for this layer."
f"\n 📌 Layer Name: {current_name}(raw)"
f"\n 📌 Shape: {base_shape}(base) vs {raw_shape}(raw)"
)
_seen_warnings.add(warning_key)
logger.warning_once(
f"\n ⚠️ Single-step alignment SKIPPED: shape mismatch."
"\n ⚠️ This warning will not repeat for this layer."
f"\n 📌 Layer Name: {current_name}(raw)"
f"\n 📌 Shape: {base_shape}(base) vs {raw_shape}(raw)"
)
return input_
else:
value = value.reshape(input_.shape)
debug_key = ("single-step: reshape_used", current_name)
if debug_key not in _seen_warnings:
logger.debug(
f"\n ⚠️ Try to reshape loaded value to input's shape of layer {current_name}(raw). "
"This may lead to numerical errors even if reshape succeeds."
"\n ⚠️ This warning will not repeat for this layer."
)
_seen_warnings.add(debug_key)
logger.debug_once(
f"\n ⚠️ Try to reshape loaded value to input's shape of layer {current_name}(raw). "
"This may lead to numerical errors even if reshape succeeds."
"\n ⚠️ This warning will not repeat for this layer."
)

if isinstance(input_, paddle.Tensor):
return paddle.to_tensor(value, dtype=input_.dtype)
Expand All @@ -336,16 +327,14 @@ def single_step_check(report, net_id, step_idx, current_name, node_type, bwd_ite
base_report_node = find_base_report_node(net_id, step_idx)
if base_report_node["name"] != current_name:
warning_key = ("single-step: name_mismatch", current_name)
if warning_key not in _seen_warnings:
logger.warning(
f"\n ⚠️ Single-step alignment WARNING: {node_type} with net_id={net_id} mismatch!"
"\n ⚠️ This warning will not repeat for this layer."
f"\n 📌 Mismatch {node_type.capitalize()}: {base_report_node['name']}(base) vs {current_name}(raw)"
"\n 💡 Suggestion: Models have different architectures or class name or initialization order. "
"Please check the model implementation or decrease 'align_depth' to reduce the alignment "
"granularity, or add layers that do not require alignment to the blacklist."
)
_seen_warnings.add(warning_key)
logger.warning_once(
f"\n ⚠️ Single-step alignment WARNING: {node_type} with net_id={net_id} mismatch!"
"\n ⚠️ This warning will not repeat for this layer."
f"\n 📌 Mismatch {node_type.capitalize()}: {base_report_node['name']}(base) vs {current_name}(raw)"
"\n 💡 Suggestion: Models have different architectures or class name or initialization order. "
"Please check the model implementation or decrease 'align_depth' to reduce the alignment "
"granularity, or add layers that do not require alignment to the blacklist."
)
else:
logger.debug(f"Single Step: {current_name}(net_id={net_id})")

Expand Down
8 changes: 4 additions & 4 deletions padiff/comparison/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def priority(self):
def __call__(self, file_list_0, file_list_1, cfg):
len_fl_0, len_fl_1 = len(file_list_0), len(file_list_1)
if len_fl_0 != len_fl_1:
logger.warning(f"number of tensors for compare is not equal, {len_fl_0} vs {len_fl_1}")
logger.warning_once(f"number of tensors for compare is not equal, {len_fl_0} vs {len_fl_1}")

min_len = min(len_fl_0, len_fl_1)

Expand All @@ -123,7 +123,7 @@ def __call__(self, file_list_0, file_list_1, cfg):
tensor_1 = np.transpose(tensor_1)

if tensor_0.size == 0 or tensor_1.size == 0:
logger.debug("Found empty tensor, compare skipped!")
logger.debug_once("Found empty tensor, compare skipped!")
continue

if tensor_0.shape != tensor_1.shape:
Expand All @@ -140,10 +140,10 @@ def __call__(self, file_list_0, file_list_1, cfg):
tensor_1 = np.reshape(tensor_1, tensor_0.shape)
else:
debug_msg += "however tensors cannot be converted to each other, skip!"
logger.debug(debug_msg)
logger.debug_once(debug_msg)
continue

logger.debug(debug_msg)
logger.debug_once(debug_msg)

assert_tensor_equal(tensor_0, tensor_1, cfg)
num_success += 1
Expand Down
9 changes: 9 additions & 0 deletions padiff/utils/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import shutil
import logging
import colorlog
import functools


log_config = {
Expand Down Expand Up @@ -87,6 +88,10 @@ def warning(self, *args):
else:
print(f"[AutoDiff] [WARNING] {' '.join(map(str, args))}")

def warning_once(self, *args):
    """Emit a warning only the first time this exact argument tuple is seen.

    Replaces ``@functools.lru_cache`` on an instance method: that caches
    on ``self`` and keeps the logger instance alive for the life of the
    process (ruff B019), and raises ``TypeError`` for unhashable args.
    A lazily-created per-instance seen-set gives the same "log once"
    semantics without either problem.
    """
    # Namespace the key so other *_once helpers can safely share the set.
    key = ("warning", args)
    seen = getattr(self, "_once_seen", None)
    if seen is None:
        seen = self._once_seen = set()
    if key not in seen:
        seen.add(key)
        self.warning(*args)

def error(self, *args):
if self._logger is not None:
self._logger.error(" ".join(map(str, args)))
Expand All @@ -99,6 +104,10 @@ def debug(self, *args):
else:
print(f"[AutoDiff] [DEBUG] {' '.join(map(str, args))}")

def debug_once(self, *args):
    """Emit a debug message only the first time this argument tuple is seen.

    Replaces ``@functools.lru_cache`` on an instance method: that caches
    on ``self`` and keeps the logger instance alive for the life of the
    process (ruff B019), and raises ``TypeError`` for unhashable args.
    A lazily-created per-instance seen-set gives the same "log once"
    semantics without either problem.
    """
    # Namespace the key so other *_once helpers can safely share the set.
    key = ("debug", args)
    seen = getattr(self, "_once_seen", None)
    if seen is None:
        seen = self._once_seen = set()
    if key not in seen:
        seen.add(key)
        self.debug(*args)

def reset_dir(self, path):
if os.path.exists(path):
shutil.rmtree(path)
Expand Down
37 changes: 27 additions & 10 deletions padiff/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
import collections.abc
import hashlib

import numpy as np
import paddle
Expand All @@ -33,12 +34,26 @@ def set_seed(seed=42):


def get_numpy_from_tensor(tensor):
bf16_warning = True

if tensor.dtype == torch.bfloat16:
np_array = tensor.cpu().detach().float().numpy()
tensor = tensor.to(torch.float32)
elif tensor.dtype == paddle.bfloat16:
np_array = tensor.cpu().detach().astype("float32").numpy()
tensor = tensor.astype("float32")
else:
np_array = tensor.cpu().detach().numpy()
bf16_warning = False

tensor = tensor.detach().cpu()

np_array = tensor.numpy()

if bf16_warning:
logger.warning_once(
"Precision Warning: The model contains 'bfloat16' tensors. "
"Due to the inherent lower precision of bfloat16 and potential differences in conversion "
"between PyTorch and PaddlePaddle, numerical comparisons may show larger-than-expected differences. "
"Consider using 'float32' for critical alignment checks or adjusting 'atol'/'rtol' accordingly."
)
return np_array


Expand Down Expand Up @@ -73,14 +88,9 @@ def set_require_grad(x):
x.stop_gradient = False


def _clone_tensor(inp): # to cpu
def _clone_tensor(inp): # no device changes
if isinstance(inp, (torch.Tensor, paddle.Tensor)):
if inp.numel() == 0:
if isinstance(inp, torch.Tensor):
return torch.tensor([], dtype=inp.dtype)
else:
return paddle.to_tensor([], dtype=inp.dtype)
new_t = inp.detach().cpu().clone()
new_t = inp.detach().clone()
if is_require_grad(inp):
set_require_grad(new_t)
return new_t
Expand Down Expand Up @@ -271,8 +281,15 @@ def assert_tensor_equal(tensor1, tensor2, cfg):
np.testing.assert_allclose(tensor1.mean(), tensor2.mean(), atol=atol, rtol=rtol)
elif compare_mode == "strict":
np.testing.assert_allclose(tensor1, tensor2, atol=atol, rtol=rtol)
elif compare_mode == "abs_strict":
np.testing.assert_allclose(abs(tensor1), abs(tensor2), atol=atol, rtol=rtol)
elif compare_mode == "abs_mean":
np.testing.assert_allclose(abs(tensor1).mean(), abs(tensor2).mean(), atol=atol, rtol=rtol)
elif compare_mode == "md5":
md5_hash1 = hashlib.md5(tensor1.tobytes()).hexdigest()
md5_hash2 = hashlib.md5(tensor2.tobytes()).hexdigest()
if md5_hash1 != md5_hash2:
raise ValueError(f"MD5 diff: {md5_hash1} vs {md5_hash2}")
else:
raise RuntimeError(f"Invalid compare_mode {compare_mode}")

Expand Down