From ce90a993b7ee8b2646517e4068f22c207b5f7bed Mon Sep 17 00:00:00 2001
From: krizaltang <krizaltang@tencent.com>
Date: Wed, 27 May 2026 23:10:27 +0800
Subject: [PATCH 1/3] [fix]: unify naming convention for hy3.

---
 configs/hy3/ptq/hy3_kvcache_calibrate.yaml    |   2 +-
 configs/hy3/ptq/hy3_vllm_calibrate.yaml       |   2 +-
 .../ptq/hy3_vllm_quant_fp8_per_tensor.yaml    |  12 +-
 scripts/ptq/README.md                         |  38 +++--
 ...Y3.sh => run_kvcache_calibrate_for_hy3.sh} |   4 +-
 ...r_HY3.sh => run_vllm_calibrate_for_hy3.sh} |   4 +-
 ...t_for_HY3.sh => run_vllm_quant_for_hy3.sh} |  14 +-
 tools/_yaml_args.py                           |   2 +-
 tools/fp8_quant_with_vllm_activation.py       | 154 +++++++++++++++++-
 tools/kvcache/README.md                       |   4 +-
 10 files changed, 192 insertions(+), 44 deletions(-)
 rename scripts/ptq/{run_kvcache_calibrate_for_HY3.sh => run_kvcache_calibrate_for_hy3.sh} (87%)
 rename scripts/ptq/{run_vllm_calibrate_for_HY3.sh => run_vllm_calibrate_for_hy3.sh} (89%)
 rename scripts/ptq/{run_vllm_quant_for_HY3.sh => run_vllm_quant_for_hy3.sh} (90%)

diff --git a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml b/configs/hy3/ptq/hy3_kvcache_calibrate.yaml
index a22f3538..def75081 100644
--- a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml
+++ b/configs/hy3/ptq/hy3_kvcache_calibrate.yaml
@@ -1,4 +1,4 @@
-# KV-cache calibration + scale search for HY3 (standalone, no weight/MoE hooks)
+# KV-cache calibration + scale search for hy3 (standalone, no weight/MoE hooks)
 # Consumed by: tools/kvcache/run_kvcache_calibrate.py
 #
 # Keys here match the script's argparse `dest` names. Values listed below
diff --git a/configs/hy3/ptq/hy3_vllm_calibrate.yaml b/configs/hy3/ptq/hy3_vllm_calibrate.yaml
index 6350616e..994ff1da 100644
--- a/configs/hy3/ptq/hy3_vllm_calibrate.yaml
+++ b/configs/hy3/ptq/hy3_vllm_calibrate.yaml
@@ -1,4 +1,4 @@
-# vLLM activation calibration for HY3 (collects activation + MoE expert stats,
+# vLLM activation calibration for hy3 (collects activation + MoE expert stats,
 # and optionally KV-cache stats / scale search).
 # Consumed by: tools/run_vllm_calibrate.py
 #
diff --git a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml b/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
index 2a9eddfd..8584688c 100644
--- a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
+++ b/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
@@ -1,4 +1,4 @@
-# Stage-2 of the HY3 PTQ pipeline: FP8 quantization of bf16 weights using
+# Stage-2 of the hy3 PTQ pipeline: FP8 quantization of bf16 weights using
 # the activation_stats / moe_expert_stats produced by stage-1 calibration.
 # Consumed by: tools/fp8_quant_with_vllm_activation.py
 #
@@ -7,7 +7,7 @@
 # precedence (e.g. `python3 ... -c this.yaml --num-workers 8` will use 8).
 #
 # IMPORTANT: input_vllm_ac_json_path MUST equal the `output_dir` used in
-# stage 1 (HY3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not
+# stage 1 (hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not
 # find activation_stats.json / moe_expert_stats.json.
 
 input_bf16_hf_path: /path/to/input/model
@@ -17,3 +17,11 @@ output_fp8_hf_path: /path/to/output/fp8_model
 # Optional: leave at defaults unless you know what you are doing
 block_size: [-1, -1]
 num_workers: 16
+
+# KV-cache scheme & granularity (must match calibration config)
+# scheme: dynamic => no static scale is saved; granularity forced to per_token_per_head
+#         static  => granularity can be none | per-tensor | per-head
+k_scheme: dynamic                       # dynamic | static
+v_scheme: static                        # dynamic | static
+k_granularity: per-head                 # none | per-tensor | per-head (only used when k_scheme=static)
+v_granularity: per-head                 # none | per-tensor | per-head (only used when v_scheme=static)
diff --git a/scripts/ptq/README.md b/scripts/ptq/README.md
index 89998f2c..1eb3ae66 100644
--- a/scripts/ptq/README.md
+++ b/scripts/ptq/README.md
@@ -8,7 +8,7 @@
 
 ## 一、环境准备（运行校准脚本前必须完成）
 
-> 📌 **硬性要求**（当前 HY3 校准脚本经过验证的配置）：
+> 📌 **硬性要求**（当前 hy3 校准脚本经过验证的配置）：
 > - **算力**：**16 卡**（两个节点 × 每节点 8 卡），用于 TP/PP 跨节点切分
 > - **vLLM 版本**：**v0.20.0**（补丁文件按此版本对齐，其它版本需要重新生成补丁）
 > - **Python 环境**：所有节点保持一致（建议使用同一个 conda / venv）
@@ -21,7 +21,7 @@
 
 ### 1. 准备 Ray 集群（2 节点 × 8 卡 = 16 卡）
 
-HY3 等大模型需要跨节点 TP/PP，校准脚本默认走 vLLM 的 Ray distributed executor，必须先在 **两台 8 卡节点** 上分别拉起 Ray，组成一个 16 卡集群。
+hy3 等大模型需要跨节点 TP/PP，校准脚本默认走 vLLM 的 Ray distributed executor，必须先在 **两台 8 卡节点** 上分别拉起 Ray，组成一个 16 卡集群。
 
 下面给出的环境变量按 **RDMA / 多网卡** 集群的常见配置示例，请按实际网络拓扑调整（特别是 `*_SOCKET_IFNAME`、`NCCL_IB_GID_INDEX`）。
 
@@ -113,19 +113,19 @@ bash tools/vllm_patch/install.sh --help      # 查看完整用法
 
 ---
 
-## 二、HY3.0 系列脚本（Hunyuan-A20B 等 HY3 模型）
+## 二、hy3.0 系列脚本（Hunyuan-A20B 等 hy3 模型）
 
 下面 3 个脚本共享同一套 vLLM 运行时环境（chunked prefill / FlashInfer attention / mp distributed executor / fused MoE 等），区别在于产出物不同。
 
 | 脚本 | 用途 | 入口 |
 | --- | --- | --- |
-| [`run_vllm_quant_for_HY3.sh`](./run_vllm_quant_for_HY3.sh) | ★ 推荐的"一键流水线"：校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` |
-| [`run_vllm_calibrate_for_HY3.sh`](./run_vllm_calibrate_for_HY3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` |
-| [`run_kvcache_calibrate_for_HY3.sh`](./run_kvcache_calibrate_for_HY3.sh) | 仅 KV-cache 校准（轻量） | `tools/kvcache/run_kvcache_calibrate.py` |
+| [`run_vllm_quant_for_hy3.sh`](./run_vllm_quant_for_hy3.sh) | ★ 推荐的"一键流水线"：校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` |
+| [`run_vllm_calibrate_for_hy3.sh`](./run_vllm_calibrate_for_hy3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` |
+| [`run_kvcache_calibrate_for_hy3.sh`](./run_kvcache_calibrate_for_hy3.sh) | 仅 KV-cache 校准（轻量） | `tools/kvcache/run_kvcache_calibrate.py` |
 
 ---
 
-### 1. `run_vllm_quant_for_HY3.sh` ★推荐的"一键流水线"
+### 1. `run_vllm_quant_for_hy3.sh` ★推荐的"一键流水线"
 
 **功能**：bf16 模型 → vLLM 激活校准 → FP8 HF safetensors，全流程一次完成。
 
@@ -141,22 +141,25 @@ bash tools/vllm_patch/install.sh --help      # 查看完整用法
 #### 阶段 2：调用 `tools/fp8_quant_with_vllm_activation.py`
 
 - 读取 `${stats_dir}` 下的 `activation_stats.json` / `moe_expert_stats.json`，结合原 bf16 权重，做 per-tensor FP8 量化（含 weight + input scale），写出到 `${fp8_path}`。
-- 当存在 per-head KV 统计时，会同时输出 `kv_cache_scales.safetensors`。
+- KV-cache scale 的写入行为由量化 YAML 中的 `k_scheme` / `v_scheme` 控制：
+  - `static`：将校准得到的 scale 写入 `kv_cache_scales.safetensors`，粒度由 `k_granularity` / `v_granularity` 决定（`none` | `per-tensor` | `per-head`）。
+  - `dynamic`：不写入对应的 scale（`model.safetensors.index.json` 中也不包含对应 key），`config.json` 中标记为 `"scheme": "dynamic", "granularity": "per_token_per_head"`（与 `q_quant` 一致）。
+- 产出的 `config.json` 中 `attn_quant_config.kv_cache_quant` 的 `k_quant` 和 `v_quant` 独立配置，支持 K/V 使用不同的 scheme。
 
 #### CLI 开关
 
 ```bash
-bash run_vllm_quant_for_HY3.sh                    # 两阶段都跑
-bash run_vllm_quant_for_HY3.sh --skip-calibrate   # 仅量化（复用已有 stats_dir）
-bash run_vllm_quant_for_HY3.sh --skip-quantize    # 仅校准
-bash run_vllm_quant_for_HY3.sh --help             # 打印用法
+bash run_vllm_quant_for_hy3.sh                    # 两阶段都跑
+bash run_vllm_quant_for_hy3.sh --skip-calibrate   # 仅量化（复用已有 stats_dir）
+bash run_vllm_quant_for_hy3.sh --skip-quantize    # 仅校准
+bash run_vllm_quant_for_hy3.sh --help             # 打印用法
 ```
 
 > 脚本开启 `set -euo pipefail`，任一阶段失败将立即中断。
 
 ---
 
-### 2. `run_vllm_calibrate_for_HY3.sh` — 一键脚本里的"阶段 1"独立版
+### 2. `run_vllm_calibrate_for_hy3.sh` — 一键脚本里的"阶段 1"独立版
 
 **功能**：只跑 W8A8C8 联合校准，不做量化。
 
@@ -168,6 +171,7 @@ bash run_vllm_quant_for_HY3.sh --help             # 打印用法
   VLLM_MOE_COLLECT_STATS_VERBOSE=0
   ```
 - **默认配置**：`--kv-granularity per-head`，并开启 `--search-kv-scale`。
+- **注意**：校准阶段无论后续 scheme 是 dynamic 还是 static，都会正常收集 KV 统计数据。scheme 的判断仅在阶段 2（量化）时生效。
 - **产物**（写入 `${output_dir}`）：
   - `activation_stats.json`
   - `moe_expert_stats.json`
@@ -177,20 +181,20 @@ bash run_vllm_quant_for_HY3.sh --help             # 打印用法
 #### 适用场景
 
 - 想自己接后续量化工具，不走 `fp8_quant_with_vllm_activation.py`。
-- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`，再用 `run_vllm_quant_for_HY3.sh --skip-calibrate` 复用结果。
+- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`，再用 `run_vllm_quant_for_hy3.sh --skip-calibrate` 复用结果。
 - Debug 用 `--skip-weight-loading` 跑 dummy 权重，快速验证 hook 注册流程。
 
 ---
 
-### 3. `run_kvcache_calibrate_for_HY3.sh` — 仅校准 KV-cache（轻量）
+### 3. `run_kvcache_calibrate_for_hy3.sh` — 仅校准 KV-cache（轻量）
 
 **功能**：只校准 KV-cache（K/V min/max），不做 weight / activation / MoE 统计。
 
 - **入口**：`tools/kvcache/run_kvcache_calibrate.py`
 
-#### 关键差异（与 `run_vllm_calibrate_for_HY3.sh` 对比）
+#### 关键差异（与 `run_vllm_calibrate_for_hy3.sh` 对比）
 
-| 维度 | `run_kvcache_calibrate_for_HY3.sh` | `run_vllm_calibrate_for_HY3.sh` |
+| 维度 | `run_kvcache_calibrate_for_hy3.sh` | `run_vllm_calibrate_for_hy3.sh` |
 | --- | --- | --- |
 | MoE / Linear 钩子 | 故意 **NOT** 设置 `VLLM_MOE_COLLECT_STATS`，完全跳过，启动更快、CPU 内存占用更低 | 全开 |
 | KV 搜索范围 | `[0.4, 8.0]`，`num_steps=50`（更窄、更聚焦） | `[0.8, 16.0]` |
diff --git a/scripts/ptq/run_kvcache_calibrate_for_HY3.sh b/scripts/ptq/run_kvcache_calibrate_for_hy3.sh
similarity index 87%
rename from scripts/ptq/run_kvcache_calibrate_for_HY3.sh
rename to scripts/ptq/run_kvcache_calibrate_for_hy3.sh
index b0777dc3..ba519324 100755
--- a/scripts/ptq/run_kvcache_calibrate_for_HY3.sh
+++ b/scripts/ptq/run_kvcache_calibrate_for_hy3.sh
@@ -17,10 +17,10 @@ export ASYNC_SCHEDULING=1
 export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
-CONFIG=configs/HY3/ptq/HY3_kvcache_calibrate.yaml
+CONFIG=configs/hy3/ptq/hy3_kvcache_calibrate.yaml
 
 mkdir -p logs
 
 python3 tools/kvcache/run_kvcache_calibrate.py \
     -c $CONFIG \
-    2>&1 | tee logs/run_kvcache_calibrate_HY3.log
+    2>&1 | tee logs/run_kvcache_calibrate_hy3.log
diff --git a/scripts/ptq/run_vllm_calibrate_for_HY3.sh b/scripts/ptq/run_vllm_calibrate_for_hy3.sh
similarity index 89%
rename from scripts/ptq/run_vllm_calibrate_for_HY3.sh
rename to scripts/ptq/run_vllm_calibrate_for_hy3.sh
index 18a1566d..a28c97c4 100755
--- a/scripts/ptq/run_vllm_calibrate_for_HY3.sh
+++ b/scripts/ptq/run_vllm_calibrate_for_hy3.sh
@@ -22,10 +22,10 @@ export ASYNC_SCHEDULING=1
 export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
-CONFIG=configs/HY3/ptq/HY3_vllm_calibrate.yaml
+CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml
 
 mkdir -p logs
 
 python3 tools/run_vllm_calibrate.py \
     -c $CONFIG \
-    2>&1 | tee logs/run_vllm_calibrate_HY3.log
+    2>&1 | tee logs/run_vllm_calibrate_hy3.log
diff --git a/scripts/ptq/run_vllm_quant_for_HY3.sh b/scripts/ptq/run_vllm_quant_for_hy3.sh
similarity index 90%
rename from scripts/ptq/run_vllm_quant_for_HY3.sh
rename to scripts/ptq/run_vllm_quant_for_hy3.sh
index 092bbabd..528718cd 100755
--- a/scripts/ptq/run_vllm_quant_for_HY3.sh
+++ b/scripts/ptq/run_vllm_quant_for_hy3.sh
@@ -18,13 +18,13 @@
 # ``output_dir`` in CALIB_CONFIG, otherwise stage 2 cannot find the stats.
 #
 # Usage:
-#   bash run_vllm_quant_for_HY3.sh
+#   bash run_vllm_quant_for_hy3.sh
 #       (run both stages back-to-back)
 #
-#   bash run_vllm_quant_for_HY3.sh --skip-calibrate
+#   bash run_vllm_quant_for_hy3.sh --skip-calibrate
 #       (skip stage 1, only quantize using existing stats dir)
 #
-#   bash run_vllm_quant_for_HY3.sh --skip-quantize
+#   bash run_vllm_quant_for_hy3.sh --skip-quantize
 #       (only run stage 1, do not produce the FP8 model)
 # =============================================================================
 
@@ -68,8 +68,8 @@ export PRECISIONMODE=HF
 # ----------------------------------------------------------------------------
 # YAML configs (one per stage)
 # ----------------------------------------------------------------------------
-CALIB_CONFIG=configs/HY3/ptq/HY3_vllm_calibrate.yaml
-QUANT_CONFIG=configs/HY3/ptq/HY3_vllm_quant_fp8.yaml
+CALIB_CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml
+QUANT_CONFIG=configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
 
 mkdir -p logs
 
@@ -82,7 +82,7 @@ if [[ "${do_calibrate}" -eq 1 ]]; then
 
     python3 tools/run_vllm_calibrate.py \
         -c "${CALIB_CONFIG}" \
-        2>&1 | tee "logs/run_vllm_quant_HY3-calibrate.log"
+        2>&1 | tee "logs/run_vllm_quant_hy3-calibrate.log"
 
     echo "[pipeline] Stage 1 finished."
 else
@@ -98,7 +98,7 @@ if [[ "${do_quantize}" -eq 1 ]]; then
 
     python3 tools/fp8_quant_with_vllm_activation.py \
         -c "${QUANT_CONFIG}" \
-        2>&1 | tee "logs/run_vllm_quant_HY3-quantize.log"
+        2>&1 | tee "logs/run_vllm_quant_hy3-quantize.log"
 
     echo "[pipeline] Stage 2 finished."
 else
diff --git a/tools/_yaml_args.py b/tools/_yaml_args.py
index 6a754c01..cead2860 100644
--- a/tools/_yaml_args.py
+++ b/tools/_yaml_args.py
@@ -14,7 +14,7 @@
 
 """Shared YAML-config loader for standalone tool scripts.
 
-The HY3 tool entry points (``tools/run_vllm_calibrate.py``,
+The hy3 tool entry points (``tools/run_vllm_calibrate.py``,
 ``tools/kvcache/run_kvcache_calibrate.py`` and
 ``tools/fp8_quant_with_vllm_activation.py``) all use ``argparse``.  To match
 the style of ``scripts/ptq/run_vllm_quant_for_deepseek_v3.sh`` (shell only
diff --git a/tools/fp8_quant_with_vllm_activation.py b/tools/fp8_quant_with_vllm_activation.py
index 6b1d35d8..8966c7d9 100644
--- a/tools/fp8_quant_with_vllm_activation.py
+++ b/tools/fp8_quant_with_vllm_activation.py
@@ -247,7 +247,50 @@ def main(bf16_path, fp8_path, block_size, ac_json_data):
         config["quantization_config"]["weight_block_size"] = block_size
 
     kv_state_dict = {}
-    kv_granularity = ""
+    k_kv_granularity = ""  # resolved granularity for k-cache
+    v_kv_granularity = ""  # resolved granularity for v-cache
+
+    # Resolve scheme & granularity from CLI/YAML args
+    k_scheme = getattr(args, "k_scheme", "static")
+    v_scheme = getattr(args, "v_scheme", "static")
+    k_granularity_cfg = getattr(args, "k_granularity", "per-head").replace("-", "_")
+    v_granularity_cfg = getattr(args, "v_granularity", "per-head").replace("-", "_")
+
+    # If scheme is dynamic, granularity is forced to per_token_per_head
+    if k_scheme == "dynamic":
+        k_kv_granularity = "per_token_per_head"
+    if v_scheme == "dynamic":
+        v_kv_granularity = "per_token_per_head"
+
+    print(f"[KV-config] k_scheme={k_scheme}, v_scheme={v_scheme}")
+    print(
+        f"[KV-config] k_granularity_cfg={k_granularity_cfg}, v_granularity_cfg={v_granularity_cfg}"
+    )
+
+    # ---- Load per-tensor tuned KV scales (from --search-kv-scale stage) ----
+    # The stage-1 search outputs ``kv_cache_tuned_scales.json`` whose keys
+    # already match the safetensor key naming (e.g.
+    # ``model.layers.X.self_attn.k_cache.scale``).  If the file exists, we
+    # prefer its values for the per-tensor branch instead of falling back
+    # to a recomputed base scale (and certainly not the legacy 1.0).
+    tuned_kv_scales = {}
+    tuned_scales_path = os.path.join(args.input_vllm_ac_json_path, "kv_cache_tuned_scales.json")
+    if os.path.isfile(tuned_scales_path):
+        try:
+            with open(tuned_scales_path, "r", encoding="utf8") as _tsf:
+                tuned_kv_scales = json.load(_tsf)
+            print(
+                f"[KV-scale] Loaded {len(tuned_kv_scales)} tuned per-tensor KV scales "
+                f"from {tuned_scales_path}"
+            )
+        except Exception as _e:
+            print(f"[WARN] failed to load {tuned_scales_path}: {_e}")
+            tuned_kv_scales = {}
+    else:
+        print(
+            f"[KV-scale] {tuned_scales_path} not found; "
+            f"will fall back to min/max-based per-tensor scale."
+        )
 
     # Auto-detect kv_head_repeat from the model's real num_key_value_heads
     # vs the per-head stats vector length collected by AngelSlim.
@@ -271,6 +314,18 @@ def main(bf16_path, fp8_path, block_size, ac_json_data):
     for scale_name, stats in ac_json_data.items():
         if "cache" not in scale_name:
             continue
+
+        # Determine whether this entry is for k-cache or v-cache
+        is_k_cache = "k_cache" in scale_name
+        is_v_cache = "v_cache" in scale_name
+        # Skip writing scale if the corresponding scheme is dynamic
+        if is_k_cache and k_scheme == "dynamic":
+            print(f"[KV-scale] SKIP (k_scheme=dynamic): {scale_name}")
+            continue
+        if is_v_cache and v_scheme == "dynamic":
+            print(f"[KV-scale] SKIP (v_scheme=dynamic): {scale_name}")
+            continue
+
         act_save_name = f"{scale_name.replace('attn.attn', 'attn')}.scale"
         min_val = stats["min"]
         max_val = stats["max"]
@@ -298,25 +353,75 @@ def main(bf16_path, fp8_path, block_size, ac_json_data):
                 )
                 per_head_scales = per_head_scales[::replication]
             tensor_input_scale = torch.tensor(per_head_scales, dtype=torch.float32)
-            kv_granularity = "per_head"
+            detected_granularity = "per_head"
         else:
             # per-tensor: single scalar scale
-            # input_scale = max(abs(min_val), abs(max_val)) / fp8_max
-            # tensor_input_scale = torch.tensor([input_scale], dtype=torch.float32)
-            tensor_input_scale = torch.tensor([1.0])
-            kv_granularity = "per_tensor"
-        print(f"{scale_name}  granularity={kv_granularity}  scale={tensor_input_scale}")
+            #
+            # Preference order:
+            #   1) Use tuned scale from kv_cache_tuned_scales.json if available
+            #      (the search-kv-scale stage already wrote one entry per
+            #       k_cache / v_cache layer using exactly the same key as
+            #       ``act_save_name`` computed above).
+            #   2) Otherwise, compute base scale from min/max as
+            #      max(|min|, |max|) / fp8_max  (this is the unsearched
+            #      baseline scale; previously this branch was hardcoded
+            #      to 1.0 which is wrong).
+            act_save_name_lookup = f"{scale_name.replace('attn.attn', 'attn')}.scale"
+            if act_save_name_lookup in tuned_kv_scales:
+                scalar_scale = float(tuned_kv_scales[act_save_name_lookup])
+                tensor_input_scale = torch.tensor([scalar_scale], dtype=torch.float32)
+                scale_source = "tuned"
+            else:
+                base_scale = max(abs(min_val), abs(max_val)) / fp8_max
+                tensor_input_scale = torch.tensor([base_scale], dtype=torch.float32)
+                scale_source = "min_max"
+            detected_granularity = "per_tensor"
+
+        # Update resolved granularity based on actual data
+        if is_k_cache and not k_kv_granularity:
+            k_kv_granularity = detected_granularity
+        if is_v_cache and not v_kv_granularity:
+            v_kv_granularity = detected_granularity
+        # scale_source is only (re)assigned on the per-tensor path; never reuse a stale value.
+        scale_source_tag = scale_source if detected_granularity == "per_tensor" else "per_head"
+        print(
+            f"{scale_name}  granularity={detected_granularity}  "
+            f"src={scale_source_tag}  scale={tensor_input_scale}"
+        )
         kv_state_dict[act_save_name] = tensor_input_scale
         index[act_save_name] = "kv_cache_scales.safetensors"
+
+    # Use config-specified granularity if scheme is static and we didn't detect from data
+    if k_scheme == "static" and not k_kv_granularity:
+        k_kv_granularity = k_granularity_cfg
+    if v_scheme == "static" and not v_kv_granularity:
+        v_kv_granularity = v_granularity_cfg
+
+    # Write kv_cache_scales.safetensors only if there are static scales to save
     if len(kv_state_dict) > 0:
         kv_safetensor_file = os.path.join(fp8_path, "kv_cache_scales.safetensors")
         save_file(kv_state_dict, kv_safetensor_file)
         config["quantization_config"]["kv_cache_scheme"] = "static"
+
+    # Build attn_quant_config: k_quant and v_quant depend on their respective schemes
+    k_quant_config = (
+        {"dtype": "fp8_e4m3", "scheme": "dynamic", "granularity": "per_token_per_head"}
+        if k_scheme == "dynamic"
+        else {"dtype": "fp8_e4m3", "scheme": "static", "granularity": k_kv_granularity}
+    )
+    v_quant_config = (
+        {"dtype": "fp8_e4m3", "scheme": "dynamic", "granularity": "per_token_per_head"}
+        if v_scheme == "dynamic"
+        else {"dtype": "fp8_e4m3", "scheme": "static", "granularity": v_kv_granularity}
+    )
+
+    # Only emit attn_quant_config if at least one of k/v has meaningful config
+    if len(kv_state_dict) > 0 or k_scheme == "dynamic" or v_scheme == "dynamic":
         config["attn_quant_config"] = {
             "kv_cache_quant": {
                 "dtype": "fp8_e4m3",
-                "k_quant": {"scheme": "static", "granularity": kv_granularity},
-                "v_quant": {"scheme": "static", "granularity": kv_granularity},
+                "k_quant": k_quant_config,
+                "v_quant": v_quant_config,
             },
             "q_quant": {
                 "dtype": "fp8_e4m3",
@@ -434,6 +539,37 @@ def process_moe_values(data: Dict[str, Dict]) -> Dict[str, Dict]:
         type=str,
         default="",
     )
+    # KV-cache scheme & granularity
+    parser.add_argument(
+        "--k-scheme",
+        type=str,
+        default="static",
+        choices=["dynamic", "static"],
+        help="K-cache quantization scheme: 'dynamic' (no static scale saved, "
+        "granularity forced to per_token_per_head) or 'static' (use calibrated scale).",
+    )
+    parser.add_argument(
+        "--v-scheme",
+        type=str,
+        default="static",
+        choices=["dynamic", "static"],
+        help="V-cache quantization scheme: 'dynamic' (no static scale saved, "
+        "granularity forced to per_token_per_head) or 'static' (use calibrated scale).",
+    )
+    parser.add_argument(
+        "--k-granularity",
+        type=str,
+        default="per-head",
+        choices=["none", "per-tensor", "per-head"],
+        help="K-cache granularity when k_scheme=static (ignored if k_scheme=dynamic).",
+    )
+    parser.add_argument(
+        "--v-granularity",
+        type=str,
+        default="per-head",
+        choices=["none", "per-tensor", "per-head"],
+        help="V-cache granularity when v_scheme=static (ignored if v_scheme=dynamic).",
+    )
     args = parser.parse_args()
 
     # Lazy-import _yaml_args (sibling module in tools/). Done here instead of
diff --git a/tools/kvcache/README.md b/tools/kvcache/README.md
index c24c5965..1313bb78 100644
--- a/tools/kvcache/README.md
+++ b/tools/kvcache/README.md
@@ -60,8 +60,8 @@ python3 tools/kvcache/run_kvcache_calibrate.py \
     --search-kv-num-steps      50
 ```
 
-A ready-to-run wrapper for HY3 lives at
-[`scripts/ptq/run_kvcache_calibrate_for_HY3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_HY3.sh).
+A ready-to-run wrapper for hy3 lives at
+[`scripts/ptq/run_kvcache_calibrate_for_hy3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_hy3.sh).
 
 > ⚠️  Requires the AngelSlim vLLM patch
 > ([`tools/vllm_patch/`](../vllm_patch/)) to be installed in the active

From f7612a67dbcdf72bf27f8fdbd0cfcbceb95331c4 Mon Sep 17 00:00:00 2001
From: krizaltang <krizaltang@tencent.com>
Date: Fri, 29 May 2026 11:52:55 +0800
Subject: [PATCH 2/3] renaming hy3 to Hy3.

---
 .../ptq/Hy3_kvcache_calibrate.yaml}           | 12 +++----
 .../ptq/Hy3_vllm_calibrate.yaml}              |  8 ++---
 .../ptq/Hy3_vllm_quant_fp8_per_tensor.yaml}   | 10 +++---
 scripts/ptq/README.md                         | 32 +++++++++----------
 ...y3.sh => run_kvcache_calibrate_for_Hy3.sh} |  4 +--
 ...r_hy3.sh => run_vllm_calibrate_for_Hy3.sh} |  4 +--
 ...t_for_hy3.sh => run_vllm_quant_for_Hy3.sh} | 14 ++++----
 tools/_yaml_args.py                           |  2 +-
 tools/kvcache/README.md                       |  4 +--
 9 files changed, 45 insertions(+), 45 deletions(-)
 rename configs/{hy3/ptq/hy3_kvcache_calibrate.yaml => Hy3/ptq/Hy3_kvcache_calibrate.yaml} (76%)
 rename configs/{hy3/ptq/hy3_vllm_calibrate.yaml => Hy3/ptq/Hy3_vllm_calibrate.yaml} (85%)
 rename configs/{hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml => Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml} (77%)
 rename scripts/ptq/{run_kvcache_calibrate_for_hy3.sh => run_kvcache_calibrate_for_Hy3.sh} (87%)
 rename scripts/ptq/{run_vllm_calibrate_for_hy3.sh => run_vllm_calibrate_for_Hy3.sh} (89%)
 rename scripts/ptq/{run_vllm_quant_for_hy3.sh => run_vllm_quant_for_Hy3.sh} (90%)

diff --git a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml b/configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml
similarity index 76%
rename from configs/hy3/ptq/hy3_kvcache_calibrate.yaml
rename to configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml
index def75081..1586e981 100644
--- a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml
+++ b/configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml
@@ -1,4 +1,4 @@
-# KV-cache calibration + scale search for hy3 (standalone, no weight/MoE hooks)
+# KV-cache calibration + scale search for Hy3 (standalone, no weight/MoE hooks)
 # Consumed by: tools/kvcache/run_kvcache_calibrate.py
 #
 # Keys here match the script's argparse `dest` names. Values listed below
@@ -19,11 +19,11 @@ distributed_executor_backend: ray       # ray | mp
 skip_weight_loading: false              # true => dummy weights (debug only)
 
 # -------- KV-cache granularity --------
-per_head: true                          # true => per-head scales; false => per-tensor
+per_head: false                         # true => per-head scales; false => per-tensor
 
 # -------- KV-cache scale search --------
-search_kv_scale: true
-search_kv_num_samples: 32
-search_kv_min_multiplier: 0.4
-search_kv_max_multiplier: 8.0
+search_kv_scale: false
+search_kv_num_samples: 64
+search_kv_min_multiplier: 0.8
+search_kv_max_multiplier: 16.0
 search_kv_num_steps: 50
diff --git a/configs/hy3/ptq/hy3_vllm_calibrate.yaml b/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
similarity index 85%
rename from configs/hy3/ptq/hy3_vllm_calibrate.yaml
rename to configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
index 994ff1da..3a29b001 100644
--- a/configs/hy3/ptq/hy3_vllm_calibrate.yaml
+++ b/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
@@ -1,4 +1,4 @@
-# vLLM activation calibration for hy3 (collects activation + MoE expert stats,
+# vLLM activation calibration for Hy3 (collects activation + MoE expert stats,
 # and optionally KV-cache stats / scale search).
 # Consumed by: tools/run_vllm_calibrate.py
 #
@@ -14,7 +14,7 @@ output_dir: /path/to/statistics
 # -------- Model loading / runtime --------
 tp_size: 16
 batch_size: 4
-num_samples: 512
+num_samples: 64
 max_length: 16384
 distributed_executor_backend: ray       # ray | mp
 skip_weight_loading: false              # true => dummy weights (debug only)
@@ -27,10 +27,10 @@ num_speculative_tokens: 1
 verbose: false
 
 # -------- KV-cache granularity --------
-kv_granularity: per-head                # none | per-tensor | per-head
+kv_granularity: per-tensor              # none | per-tensor | per-head
 
 # -------- KV-cache scale search --------
-search_kv_scale: true
+search_kv_scale: false
 search_kv_num_samples: 64
 search_kv_min_multiplier: 0.8
 search_kv_max_multiplier: 16.0
diff --git a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml b/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
similarity index 77%
rename from configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
rename to configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
index 8584688c..6f145a3e 100644
--- a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
+++ b/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
@@ -1,4 +1,4 @@
-# Stage-2 of the hy3 PTQ pipeline: FP8 quantization of bf16 weights using
+# Stage-2 of the Hy3 PTQ pipeline: FP8 quantization of bf16 weights using
 # the activation_stats / moe_expert_stats produced by stage-1 calibration.
 # Consumed by: tools/fp8_quant_with_vllm_activation.py
 #
@@ -7,7 +7,7 @@
 # precedence (e.g. `python3 ... -c this.yaml --num-workers 8` will use 8).
 #
 # IMPORTANT: input_vllm_ac_json_path MUST equal the `output_dir` used in
-# stage 1 (hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not
+# stage 1 (Hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not
 # find activation_stats.json / moe_expert_stats.json.
 
 input_bf16_hf_path: /path/to/input/model
@@ -21,7 +21,7 @@ num_workers: 16
 # KV-cache scheme & granularity (must match calibration config)
 # scheme: dynamic => no static scale is saved; granularity forced to per_token_per_head
 #         static  => granularity can be none | per-tensor | per-head
-k_scheme: dynamic                       # dynamic | static
+k_scheme: static                        # dynamic | static
 v_scheme: static                        # dynamic | static
-k_granularity: per-head                 # none | per-tensor | per-head (only used when k_scheme=static)
-v_granularity: per-head                 # none | per-tensor | per-head (only used when v_scheme=static)
+k_granularity: per-tensor               # none | per-tensor | per-head (only used when k_scheme=static)
+v_granularity: per-tensor               # none | per-tensor | per-head (only used when v_scheme=static)
diff --git a/scripts/ptq/README.md b/scripts/ptq/README.md
index 1eb3ae66..7931fe5a 100644
--- a/scripts/ptq/README.md
+++ b/scripts/ptq/README.md
@@ -8,7 +8,7 @@
 
 ## 一、环境准备（运行校准脚本前必须完成）
 
-> 📌 **硬性要求**（当前 hy3 校准脚本经过验证的配置）：
+> 📌 **硬性要求**（当前 Hy3 校准脚本经过验证的配置）：
 > - **算力**：**16 卡**（两个节点 × 每节点 8 卡），用于 TP/PP 跨节点切分
 > - **vLLM 版本**：**v0.20.0**（补丁文件按此版本对齐，其它版本需要重新生成补丁）
 > - **Python 环境**：所有节点保持一致（建议使用同一个 conda / venv）
@@ -21,7 +21,7 @@
 
 ### 1. 准备 Ray 集群（2 节点 × 8 卡 = 16 卡）
 
-hy3 等大模型需要跨节点 TP/PP，校准脚本默认走 vLLM 的 Ray distributed executor，必须先在 **两台 8 卡节点** 上分别拉起 Ray，组成一个 16 卡集群。
+Hy3 等大模型需要跨节点 TP/PP，校准脚本默认走 vLLM 的 Ray distributed executor，必须先在 **两台 8 卡节点** 上分别拉起 Ray，组成一个 16 卡集群。
 
 下面给出的环境变量按 **RDMA / 多网卡** 集群的常见配置示例，请按实际网络拓扑调整（特别是 `*_SOCKET_IFNAME`、`NCCL_IB_GID_INDEX`）。
 
@@ -113,19 +113,19 @@ bash tools/vllm_patch/install.sh --help      # 查看完整用法
 
 ---
 
-## 二、hy3.0 系列脚本（Hunyuan-A20B 等 hy3 模型）
+## 二、Hy3.0 系列脚本（Hunyuan-A20B 等 Hy3 模型）
 
 下面 3 个脚本共享同一套 vLLM 运行时环境（chunked prefill / FlashInfer attention / mp distributed executor / fused MoE 等），区别在于产出物不同。
 
 | 脚本 | 用途 | 入口 |
 | --- | --- | --- |
-| [`run_vllm_quant_for_hy3.sh`](./run_vllm_quant_for_hy3.sh) | ★ 推荐的"一键流水线"：校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` |
-| [`run_vllm_calibrate_for_hy3.sh`](./run_vllm_calibrate_for_hy3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` |
-| [`run_kvcache_calibrate_for_hy3.sh`](./run_kvcache_calibrate_for_hy3.sh) | 仅 KV-cache 校准（轻量） | `tools/kvcache/run_kvcache_calibrate.py` |
+| [`run_vllm_quant_for_Hy3.sh`](./run_vllm_quant_for_Hy3.sh) | ★ 推荐的"一键流水线"：校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` |
+| [`run_vllm_calibrate_for_Hy3.sh`](./run_vllm_calibrate_for_Hy3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` |
+| [`run_kvcache_calibrate_for_Hy3.sh`](./run_kvcache_calibrate_for_Hy3.sh) | 仅 KV-cache 校准（轻量） | `tools/kvcache/run_kvcache_calibrate.py` |
 
 ---
 
-### 1. `run_vllm_quant_for_hy3.sh` ★推荐的"一键流水线"
+### 1. `run_vllm_quant_for_Hy3.sh` ★推荐的"一键流水线"
 
 **功能**：bf16 模型 → vLLM 激活校准 → FP8 HF safetensors，全流程一次完成。
 
@@ -149,17 +149,17 @@ bash tools/vllm_patch/install.sh --help      # 查看完整用法
 #### CLI 开关
 
 ```bash
-bash run_vllm_quant_for_hy3.sh                    # 两阶段都跑
-bash run_vllm_quant_for_hy3.sh --skip-calibrate   # 仅量化（复用已有 stats_dir）
-bash run_vllm_quant_for_hy3.sh --skip-quantize    # 仅校准
-bash run_vllm_quant_for_hy3.sh --help             # 打印用法
+bash run_vllm_quant_for_Hy3.sh                    # 两阶段都跑
+bash run_vllm_quant_for_Hy3.sh --skip-calibrate   # 仅量化（复用已有 stats_dir）
+bash run_vllm_quant_for_Hy3.sh --skip-quantize    # 仅校准
+bash run_vllm_quant_for_Hy3.sh --help             # 打印用法
 ```
 
 > 脚本开启 `set -euo pipefail`，任一阶段失败将立即中断。
 
 ---
 
-### 2. `run_vllm_calibrate_for_hy3.sh` — 一键脚本里的"阶段 1"独立版
+### 2. `run_vllm_calibrate_for_Hy3.sh` — 一键脚本里的"阶段 1"独立版
 
 **功能**：只跑 W8A8C8 联合校准，不做量化。
 
@@ -181,20 +181,20 @@ bash run_vllm_quant_for_hy3.sh --help             # 打印用法
 #### 适用场景
 
 - 想自己接后续量化工具，不走 `fp8_quant_with_vllm_activation.py`。
-- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`，再用 `run_vllm_quant_for_hy3.sh --skip-calibrate` 复用结果。
+- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`，再用 `run_vllm_quant_for_Hy3.sh --skip-calibrate` 复用结果。
 - Debug 用 `--skip-weight-loading` 跑 dummy 权重，快速验证 hook 注册流程。
 
 ---
 
-### 3. `run_kvcache_calibrate_for_hy3.sh` — 仅校准 KV-cache（轻量）
+### 3. `run_kvcache_calibrate_for_Hy3.sh` — 仅校准 KV-cache（轻量）
 
 **功能**：只校准 KV-cache（K/V min/max），不做 weight / activation / MoE 统计。
 
 - **入口**：`tools/kvcache/run_kvcache_calibrate.py`
 
-#### 关键差异（与 `run_vllm_calibrate_for_hy3.sh` 对比）
+#### 关键差异（与 `run_vllm_calibrate_for_Hy3.sh` 对比）
 
-| 维度 | `run_kvcache_calibrate_for_hy3.sh` | `run_vllm_calibrate_for_hy3.sh` |
+| 维度 | `run_kvcache_calibrate_for_Hy3.sh` | `run_vllm_calibrate_for_Hy3.sh` |
 | --- | --- | --- |
 | MoE / Linear 钩子 | 故意 **NOT** 设置 `VLLM_MOE_COLLECT_STATS`，完全跳过，启动更快、CPU 内存占用更低 | 全开 |
 | KV 搜索范围 | `[0.4, 8.0]`，`num_steps=50`（更窄、更聚焦） | `[0.8, 16.0]` |
diff --git a/scripts/ptq/run_kvcache_calibrate_for_hy3.sh b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh
similarity index 87%
rename from scripts/ptq/run_kvcache_calibrate_for_hy3.sh
rename to scripts/ptq/run_kvcache_calibrate_for_Hy3.sh
index ba519324..eb3413cb 100755
--- a/scripts/ptq/run_kvcache_calibrate_for_hy3.sh
+++ b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh
@@ -17,10 +17,10 @@ export ASYNC_SCHEDULING=1
 export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
-CONFIG=configs/hy3/ptq/hy3_kvcache_calibrate.yaml
+CONFIG=configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml
 
 mkdir -p logs
 
 python3 tools/kvcache/run_kvcache_calibrate.py \
     -c $CONFIG \
-    2>&1 | tee logs/run_kvcache_calibrate_hy3.log
+    2>&1 | tee logs/run_kvcache_calibrate_Hy3.log
diff --git a/scripts/ptq/run_vllm_calibrate_for_hy3.sh b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh
similarity index 89%
rename from scripts/ptq/run_vllm_calibrate_for_hy3.sh
rename to scripts/ptq/run_vllm_calibrate_for_Hy3.sh
index a28c97c4..c7c9c812 100755
--- a/scripts/ptq/run_vllm_calibrate_for_hy3.sh
+++ b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh
@@ -22,10 +22,10 @@ export ASYNC_SCHEDULING=1
 export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
-CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml
+CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
 
 mkdir -p logs
 
 python3 tools/run_vllm_calibrate.py \
     -c $CONFIG \
-    2>&1 | tee logs/run_vllm_calibrate_hy3.log
+    2>&1 | tee logs/run_vllm_calibrate_Hy3.log
diff --git a/scripts/ptq/run_vllm_quant_for_hy3.sh b/scripts/ptq/run_vllm_quant_for_Hy3.sh
similarity index 90%
rename from scripts/ptq/run_vllm_quant_for_hy3.sh
rename to scripts/ptq/run_vllm_quant_for_Hy3.sh
index 528718cd..e5f53df8 100755
--- a/scripts/ptq/run_vllm_quant_for_hy3.sh
+++ b/scripts/ptq/run_vllm_quant_for_Hy3.sh
@@ -18,13 +18,13 @@
 # ``output_dir`` in CALIB_CONFIG, otherwise stage 2 cannot find the stats.
 #
 # Usage:
-#   bash run_vllm_quant_for_hy3.sh
+#   bash run_vllm_quant_for_Hy3.sh
 #       (run both stages back-to-back)
 #
-#   bash run_vllm_quant_for_hy3.sh --skip-calibrate
+#   bash run_vllm_quant_for_Hy3.sh --skip-calibrate
 #       (skip stage 1, only quantize using existing stats dir)
 #
-#   bash run_vllm_quant_for_hy3.sh --skip-quantize
+#   bash run_vllm_quant_for_Hy3.sh --skip-quantize
 #       (only run stage 1, do not produce the FP8 model)
 # =============================================================================
 
@@ -68,8 +68,8 @@ export PRECISIONMODE=HF
 # ----------------------------------------------------------------------------
 # YAML configs (one per stage)
 # ----------------------------------------------------------------------------
-CALIB_CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml
-QUANT_CONFIG=configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml
+CALIB_CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
+QUANT_CONFIG=configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
 
 mkdir -p logs
 
@@ -82,7 +82,7 @@ if [[ "${do_calibrate}" -eq 1 ]]; then
 
     python3 tools/run_vllm_calibrate.py \
         -c "${CALIB_CONFIG}" \
-        2>&1 | tee "logs/run_vllm_quant_hy3-calibrate.log"
+        2>&1 | tee "logs/run_vllm_quant_Hy3-calibrate.log"
 
     echo "[pipeline] Stage 1 finished."
 else
@@ -98,7 +98,7 @@ if [[ "${do_quantize}" -eq 1 ]]; then
 
     python3 tools/fp8_quant_with_vllm_activation.py \
         -c "${QUANT_CONFIG}" \
-        2>&1 | tee "logs/run_vllm_quant_hy3-quantize.log"
+        2>&1 | tee "logs/run_vllm_quant_Hy3-quantize.log"
 
     echo "[pipeline] Stage 2 finished."
 else
diff --git a/tools/_yaml_args.py b/tools/_yaml_args.py
index cead2860..74625454 100644
--- a/tools/_yaml_args.py
+++ b/tools/_yaml_args.py
@@ -14,7 +14,7 @@
 
 """Shared YAML-config loader for standalone tool scripts.
 
-The hy3 tool entry points (``tools/run_vllm_calibrate.py``,
+The Hy3 tool entry points (``tools/run_vllm_calibrate.py``,
 ``tools/kvcache/run_kvcache_calibrate.py`` and
 ``tools/fp8_quant_with_vllm_activation.py``) all use ``argparse``.  To match
 the style of ``scripts/ptq/run_vllm_quant_for_deepseek_v3.sh`` (shell only
diff --git a/tools/kvcache/README.md b/tools/kvcache/README.md
index 1313bb78..c673d0c5 100644
--- a/tools/kvcache/README.md
+++ b/tools/kvcache/README.md
@@ -60,8 +60,8 @@ python3 tools/kvcache/run_kvcache_calibrate.py \
     --search-kv-num-steps      50
 ```
 
-A ready-to-run wrapper for hy3 lives at
-[`scripts/ptq/run_kvcache_calibrate_for_hy3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_hy3.sh).
+A ready-to-run wrapper for Hy3 lives at
+[`scripts/ptq/run_kvcache_calibrate_for_Hy3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_Hy3.sh).
 
 > ⚠️  Requires the AngelSlim vLLM patch
 > ([`tools/vllm_patch/`](../vllm_patch/)) to be installed in the active

From 6bcdf3db3849bbcbfc22177138abb0140c7efbc2 Mon Sep 17 00:00:00 2001
From: krizaltang <krizaltang@tencent.com>
Date: Fri, 29 May 2026 14:27:57 +0800
Subject: [PATCH 3/3] add kv per head yaml.

---
 configs/Hy3/ptq/Hy3_vllm_calibrate.yaml       | 37 ----------
 .../ptq/Hy3_vllm_quant_fp8_per_tensor.yaml    | 27 -------
 .../ptq/{ => fp8}/Hy3_kvcache_calibrate.yaml  |  0
 .../Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml | 70 +++++++++++++++++++
 .../Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml  | 70 +++++++++++++++++++
 scripts/ptq/README.md                         | 10 ++-
 scripts/ptq/run_kvcache_calibrate_for_Hy3.sh  |  2 +-
 scripts/ptq/run_vllm_calibrate_for_Hy3.sh     |  2 +-
 scripts/ptq/run_vllm_quant_for_Hy3.sh         | 25 +++----
 tools/fp8_quant_with_vllm_activation.py       | 37 ++++++++--
 10 files changed, 194 insertions(+), 86 deletions(-)
 delete mode 100644 configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
 delete mode 100644 configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
 rename configs/Hy3/ptq/{ => fp8}/Hy3_kvcache_calibrate.yaml (100%)
 create mode 100644 configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml
 create mode 100644 configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml

diff --git a/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml b/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
deleted file mode 100644
index 3a29b001..00000000
--- a/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-# vLLM activation calibration for Hy3 (collects activation + MoE expert stats,
-# and optionally KV-cache stats / scale search).
-# Consumed by: tools/run_vllm_calibrate.py
-#
-# Keys here match the script's argparse `dest` names. Values listed below
-# override argparse defaults; explicit command-line flags still take final
-# precedence (e.g. `python3 ... -c this.yaml --tp-size 8` will use 8).
-
-# -------- Paths --------
-model_path: /path/to/model
-ptq_data_path: /path/to/dataset
-output_dir: /path/to/statistics
-
-# -------- Model loading / runtime --------
-tp_size: 16
-batch_size: 4
-num_samples: 64
-max_length: 16384
-distributed_executor_backend: ray       # ray | mp
-skip_weight_loading: false              # true => dummy weights (debug only)
-
-# -------- MTP (Multi-Token Prediction) --------
-enable_mtp: false
-num_speculative_tokens: 1
-
-# -------- Debug --------
-verbose: false
-
-# -------- KV-cache granularity --------
-kv_granularity: per-tensor              # none | per-tensor | per-head
-
-# -------- KV-cache scale search --------
-search_kv_scale: false
-search_kv_num_samples: 64
-search_kv_min_multiplier: 0.8
-search_kv_max_multiplier: 16.0
-search_kv_num_steps: 50
diff --git a/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml b/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
deleted file mode 100644
index 6f145a3e..00000000
--- a/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Stage-2 of the Hy3 PTQ pipeline: FP8 quantization of bf16 weights using
-# the activation_stats / moe_expert_stats produced by stage-1 calibration.
-# Consumed by: tools/fp8_quant_with_vllm_activation.py
-#
-# Keys here match the script's argparse `dest` names. Values listed below
-# override argparse defaults; explicit command-line flags still take final
-# precedence (e.g. `python3 ... -c this.yaml --num-workers 8` will use 8).
-#
-# IMPORTANT: input_vllm_ac_json_path MUST equal the `output_dir` used in
-# stage 1 (Hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not
-# find activation_stats.json / moe_expert_stats.json.
-
-input_bf16_hf_path: /path/to/input/model
-input_vllm_ac_json_path: /path/to/statistics
-output_fp8_hf_path: /path/to/output/fp8_model
-
-# Optional: leave at defaults unless you know what you are doing
-block_size: [-1, -1]
-num_workers: 16
-
-# KV-cache scheme & granularity (must match calibration config)
-# scheme: dynamic => no static scale is saved; granularity forced to per_token_per_head
-#         static  => granularity can be none | per-tensor | per-head
-k_scheme: static                        # dynamic | static
-v_scheme: static                        # dynamic | static
-k_granularity: per-tensor               # none | per-tensor | per-head (only used when k_scheme=static)
-v_granularity: per-tensor               # none | per-tensor | per-head (only used when v_scheme=static)
diff --git a/configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml b/configs/Hy3/ptq/fp8/Hy3_kvcache_calibrate.yaml
similarity index 100%
rename from configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml
rename to configs/Hy3/ptq/fp8/Hy3_kvcache_calibrate.yaml
diff --git a/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml
new file mode 100644
index 00000000..3f4fd4e1
--- /dev/null
+++ b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml
@@ -0,0 +1,70 @@
+# Unified YAML config for the Hy3 PTQ pipeline.
+# Consumed by BOTH stages of scripts/ptq/run_vllm_quant_for_Hy3.sh:
+#   - tools/run_vllm_calibrate.py            (stage 1: activation calibration)
+#   - tools/fp8_quant_with_vllm_activation.py (stage 2: FP8 quantization)
+#
+# Each stage's argparse picks up only the keys it knows about; unrelated
+# keys are ignored with a single "[yaml-config] WARNING: unknown keys" log.
+# Keys here use underscore form (matching argparse `dest`). Explicit CLI
+# flags still take final precedence over YAML values.
+
+# ============================================================================
+# Shared paths (model_path / output_dir are defined for stage 1; stage 2
+# reuses them via fallbacks: input_bf16_hf_path defaults to model_path,
+# input_vllm_ac_json_path defaults to output_dir).
+# ============================================================================
+model_path: /path/to/model              # stage1: bf16 model dir; stage2 reuses as input_bf16_hf_path
+ptq_data_path: /path/to/dataset         # stage1 only
+output_dir: /path/to/statistics         # stage1: where stats are written; stage2 reuses as input_vllm_ac_json_path
+output_fp8_hf_path: /path/to/output/fp8_model  # stage2 only
+
+# ============================================================================
+# Stage 1 — vLLM activation calibration (tools/run_vllm_calibrate.py)
+# ============================================================================
+
+# -------- Model loading / runtime --------
+tp_size: 16
+batch_size: 4
+num_samples: 512
+max_length: 16384
+distributed_executor_backend: ray       # ray | mp
+skip_weight_loading: false              # true => dummy weights (debug only)
+
+# -------- MTP (Multi-Token Prediction) --------
+enable_mtp: false
+num_speculative_tokens: 1
+
+# -------- Debug --------
+verbose: false
+
+# -------- KV-cache calibration granularity --------
+# Used by stage 1 to decide how KV stats are *collected*. Calibration always
+# runs regardless of the stage-2 scheme; this only controls the granularity
+# of the collected scale (none | per-tensor | per-head).
+kv_granularity: per-head
+
+# -------- KV-cache scale search (stage 1) --------
+search_kv_scale: true
+search_kv_num_samples: 64
+search_kv_min_multiplier: 0.8
+search_kv_max_multiplier: 16.0
+search_kv_num_steps: 50
+
+# ============================================================================
+# Stage 2 — FP8 quantization (tools/fp8_quant_with_vllm_activation.py)
+# ============================================================================
+
+# Optional: leave at defaults unless you know what you are doing
+block_size: [-1, -1]
+num_workers: 16
+
+# KV-cache scheme & granularity at *quantization* time.
+#   scheme = dynamic => no static scale is saved; granularity is forced to
+#                       per_token_per_head in config.json.
+#   scheme = static  => calibrated scale is written to kv_cache_scales.safetensors,
+#                       at the granularity selected below
+#                       (none | per-tensor | per-head).
+k_scheme: dynamic                       # dynamic | static
+v_scheme: static                        # dynamic | static
+quant_k_granularity: per-head           # only used when k_scheme=static
+quant_v_granularity: per-head           # only used when v_scheme=static
diff --git a/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml
new file mode 100644
index 00000000..aa93074c
--- /dev/null
+++ b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml
@@ -0,0 +1,70 @@
+# Unified YAML config for the Hy3 PTQ pipeline.
+# Consumed by BOTH stages of scripts/ptq/run_vllm_quant_for_Hy3.sh:
+#   - tools/run_vllm_calibrate.py            (stage 1: activation calibration)
+#   - tools/fp8_quant_with_vllm_activation.py (stage 2: FP8 quantization)
+#
+# Each stage's argparse picks up only the keys it knows about; unrelated
+# keys are ignored with a single "[yaml-config] WARNING: unknown keys" log.
+# Keys here use underscore form (matching argparse `dest`). Explicit CLI
+# flags still take final precedence over YAML values.
+
+# ============================================================================
+# Shared paths (model_path / output_dir are defined for stage 1; stage 2
+# reuses them via fallbacks: input_bf16_hf_path defaults to model_path,
+# input_vllm_ac_json_path defaults to output_dir).
+# ============================================================================
+model_path: /path/to/model              # stage1: bf16 model dir; stage2 reuses as input_bf16_hf_path
+ptq_data_path: /path/to/dataset         # stage1 only
+output_dir: /path/to/statistics         # stage1: where stats are written; stage2 reuses as input_vllm_ac_json_path
+output_fp8_hf_path: /path/to/output/fp8_model  # stage2 only
+
+# ============================================================================
+# Stage 1 — vLLM activation calibration (tools/run_vllm_calibrate.py)
+# ============================================================================
+
+# -------- Model loading / runtime --------
+tp_size: 16
+batch_size: 4
+num_samples: 512
+max_length: 16384
+distributed_executor_backend: ray       # ray | mp
+skip_weight_loading: false              # true => dummy weights (debug only)
+
+# -------- MTP (Multi-Token Prediction) --------
+enable_mtp: false
+num_speculative_tokens: 1
+
+# -------- Debug --------
+verbose: false
+
+# -------- KV-cache calibration granularity --------
+# Used by stage 1 to decide how KV stats are *collected*. Calibration always
+# runs regardless of the stage-2 scheme; this only controls the granularity
+# of the collected scale (none | per-tensor | per-head).
+kv_granularity: per-tensor
+
+# -------- KV-cache scale search (stage 1) --------
+search_kv_scale: true
+search_kv_num_samples: 64
+search_kv_min_multiplier: 0.8
+search_kv_max_multiplier: 16.0
+search_kv_num_steps: 50
+
+# ============================================================================
+# Stage 2 — FP8 quantization (tools/fp8_quant_with_vllm_activation.py)
+# ============================================================================
+
+# Optional: leave at defaults unless you know what you are doing
+block_size: [-1, -1]
+num_workers: 16
+
+# KV-cache scheme & granularity at *quantization* time.
+#   scheme = dynamic => no static scale is saved; granularity is forced to
+#                       per_token_per_head in config.json.
+#   scheme = static  => calibrated scale is written to kv_cache_scales.safetensors,
+#                       at the granularity selected below
+#                       (none | per-tensor | per-head).
+k_scheme: static                        # dynamic | static
+v_scheme: static                        # dynamic | static
+quant_k_granularity: per-tensor         # only used when k_scheme=static
+quant_v_granularity: per-tensor         # only used when v_scheme=static
diff --git a/scripts/ptq/README.md b/scripts/ptq/README.md
index 7931fe5a..9201460f 100644
--- a/scripts/ptq/README.md
+++ b/scripts/ptq/README.md
@@ -141,8 +141,14 @@ bash tools/vllm_patch/install.sh --help      # 查看完整用法
 #### 阶段 2：调用 `tools/fp8_quant_with_vllm_activation.py`
 
 - 读取 `${stats_dir}` 下的 `activation_stats.json` / `moe_expert_stats.json`，结合原 bf16 权重，做 per-tensor FP8 量化（含 weight + input scale），写出到 `${fp8_path}`。
-- KV-cache scale 的写入行为由量化 YAML 中的 `k_scheme` / `v_scheme` 控制：
-  - `static`：将校准得到的 scale 写入 `kv_cache_scales.safetensors`，粒度由 `k_granularity` / `v_granularity` 决定（`none` | `per-tensor` | `per-head`）。
+- 校准（stage-1）与量化（stage-2）共享 **同一份 YAML**：[`configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml`](../../configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml)。
+  - 路径只配一次：stage-2 的 `input_bf16_hf_path` 默认回退到 stage-1 的 `model_path`，`input_vllm_ac_json_path` 默认回退到 stage-1 的 `output_dir`。
+  - 每个阶段只读取自己关心的字段，不认识的字段会打一行 `[yaml-config] WARNING: unknown keys` 然后忽略，属于正常现象。
+- KV-cache 的"校准粒度"与"量化粒度"分开控制：
+  - 校准阶段（stage-1）由 `kv_granularity`（`none` | `per-tensor` | `per-head`）决定 KV scale 的收集粒度。
+  - 量化阶段（stage-2）由 `k_scheme` / `v_scheme`（`dynamic` | `static`）决定是否把 scale 写进 safetensor；当 scheme=`static` 时，再由 `quant_k_granularity` / `quant_v_granularity`（`none` | `per-tensor` | `per-head`）决定写入粒度。
+- KV-cache scale 的写入行为由量化阶段的 `k_scheme` / `v_scheme` 控制：
+  - `static`：将校准得到的 scale 写入 `kv_cache_scales.safetensors`，粒度由 `quant_k_granularity` / `quant_v_granularity` 决定（`none` | `per-tensor` | `per-head`）。
   - `dynamic`：不写入对应的 scale（`model.safetensors.index.json` 中也不包含对应 key），`config.json` 中标记为 `"scheme": "dynamic", "granularity": "per_token_per_head"`（与 `q_quant` 一致）。
 - 产出的 `config.json` 中 `attn_quant_config.kv_cache_quant` 的 `k_quant` 和 `v_quant` 独立配置，支持 K/V 使用不同的 scheme。
 
diff --git a/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh
index eb3413cb..d30a70b0 100755
--- a/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh
+++ b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh
@@ -17,7 +17,7 @@ export ASYNC_SCHEDULING=1
 export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
-CONFIG=configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml
+CONFIG=configs/Hy3/ptq/fp8/Hy3_kvcache_calibrate.yaml
 
 mkdir -p logs
 
diff --git a/scripts/ptq/run_vllm_calibrate_for_Hy3.sh b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh
index c7c9c812..90dd3add 100755
--- a/scripts/ptq/run_vllm_calibrate_for_Hy3.sh
+++ b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh
@@ -22,7 +22,7 @@ export ASYNC_SCHEDULING=1
 export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
-CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
+CONFIG=configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml
 
 mkdir -p logs
 
diff --git a/scripts/ptq/run_vllm_quant_for_Hy3.sh b/scripts/ptq/run_vllm_quant_for_Hy3.sh
index e5f53df8..1b688290 100755
--- a/scripts/ptq/run_vllm_quant_for_Hy3.sh
+++ b/scripts/ptq/run_vllm_quant_for_Hy3.sh
@@ -6,16 +6,17 @@
 # Stage 1: tools/run_vllm_calibrate.py
 #   * Loads the bf16 model with vLLM, runs forward passes on the PTQ dataset,
 #     and dumps activation_stats.json / moe_expert_stats.json / kv_cache_*
-#     into the directory given by ``output_dir`` in CALIB_CONFIG.
+#     into the directory given by ``output_dir`` in PTQ_CONFIG.
 #
 # Stage 2: tools/fp8_quant_with_vllm_activation.py
 #   * Reads activation_stats.json (+ moe_expert_stats.json if any) plus the
 #     original bf16 weights, applies per-tensor FP8 quantization with
 #     calibrated input scales, and writes the FP8 HF model into the directory
-#     given by ``output_fp8_hf_path`` in QUANT_CONFIG.
+#     given by ``output_fp8_hf_path`` in PTQ_CONFIG.
 #
-# IMPORTANT: ``input_vllm_ac_json_path`` in QUANT_CONFIG must equal
-# ``output_dir`` in CALIB_CONFIG, otherwise stage 2 cannot find the stats.
+# Both stages share a SINGLE unified YAML (PTQ_CONFIG); stage 2 reuses
+# stage 1's ``model_path`` as ``input_bf16_hf_path`` and ``output_dir`` as
+# ``input_vllm_ac_json_path``, so paths only need to be set once.
 #
 # Usage:
 #   bash run_vllm_quant_for_Hy3.sh
@@ -41,7 +42,7 @@ for arg in "$@"; do
         --skip-calibrate) do_calibrate=0 ;;
         --skip-quantize)  do_quantize=0  ;;
         -h|--help)
-            sed -n '2,30p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//'
+            sed -n '2,32p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//'
             exit 0
             ;;
         *)
@@ -66,10 +67,10 @@ export VLLM_ENABLE_PREFIX_CACHING=1
 export PRECISIONMODE=HF
 
 # ----------------------------------------------------------------------------
-# YAML configs (one per stage)
+# Unified YAML config (drives BOTH stages; each stage's argparse picks up
+# only the keys it knows about, and unknown keys are warned-and-ignored).
 # ----------------------------------------------------------------------------
-CALIB_CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml
-QUANT_CONFIG=configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml
+PTQ_CONFIG=configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml
 
 mkdir -p logs
 
@@ -78,10 +79,10 @@ mkdir -p logs
 # ============================================================================
 if [[ "${do_calibrate}" -eq 1 ]]; then
     echo "[pipeline] === Stage 1/2: activation calibration ==="
-    echo "[pipeline] CALIB_CONFIG=${CALIB_CONFIG}"
+    echo "[pipeline] PTQ_CONFIG=${PTQ_CONFIG}"
 
     python3 tools/run_vllm_calibrate.py \
-        -c "${CALIB_CONFIG}" \
+        -c "${PTQ_CONFIG}" \
         2>&1 | tee "logs/run_vllm_quant_Hy3-calibrate.log"
 
     echo "[pipeline] Stage 1 finished."
@@ -94,10 +95,10 @@ fi
 # ============================================================================
 if [[ "${do_quantize}" -eq 1 ]]; then
     echo "[pipeline] === Stage 2/2: FP8 quantization ==="
-    echo "[pipeline] QUANT_CONFIG=${QUANT_CONFIG}"
+    echo "[pipeline] PTQ_CONFIG=${PTQ_CONFIG}"
 
     python3 tools/fp8_quant_with_vllm_activation.py \
-        -c "${QUANT_CONFIG}" \
+        -c "${PTQ_CONFIG}" \
         2>&1 | tee "logs/run_vllm_quant_Hy3-quantize.log"
 
     echo "[pipeline] Stage 2 finished."
diff --git a/tools/fp8_quant_with_vllm_activation.py b/tools/fp8_quant_with_vllm_activation.py
index 8966c7d9..a4dca258 100644
--- a/tools/fp8_quant_with_vllm_activation.py
+++ b/tools/fp8_quant_with_vllm_activation.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import math
 import multiprocessing as mp
@@ -253,8 +254,8 @@ def main(bf16_path, fp8_path, block_size, ac_json_data):
     # Resolve scheme & granularity from CLI/YAML args
     k_scheme = getattr(args, "k_scheme", "static")
     v_scheme = getattr(args, "v_scheme", "static")
-    k_granularity_cfg = getattr(args, "k_granularity", "per-head").replace("-", "_")
-    v_granularity_cfg = getattr(args, "v_granularity", "per-head").replace("-", "_")
+    k_granularity_cfg = getattr(args, "quant_k_granularity", "per-head").replace("-", "_")
+    v_granularity_cfg = getattr(args, "quant_v_granularity", "per-head").replace("-", "_")
 
     # If scheme is dynamic, granularity is forced to per_token_per_head
     if k_scheme == "dynamic":
@@ -557,19 +558,27 @@ def process_moe_values(data: Dict[str, Dict]) -> Dict[str, Dict]:
         "granularity forced to per_token_per_head) or 'static' (use calibrated scale).",
     )
     parser.add_argument(
-        "--k-granularity",
+        "--quant-k-granularity",
         type=str,
         default="per-head",
         choices=["none", "per-tensor", "per-head"],
-        help="K-cache granularity when k_scheme=static (ignored if k_scheme=dynamic).",
+        help="K-cache granularity used at *quantization* time when k_scheme=static "
+        "(ignored if k_scheme=dynamic). Distinct from the calibration-time "
+        "granularity controlled by stage-1's --kv-granularity.",
     )
     parser.add_argument(
-        "--v-granularity",
+        "--quant-v-granularity",
         type=str,
         default="per-head",
         choices=["none", "per-tensor", "per-head"],
-        help="V-cache granularity when v_scheme=static (ignored if v_scheme=dynamic).",
+        help="V-cache granularity used at *quantization* time when v_scheme=static "
+        "(ignored if v_scheme=dynamic). Distinct from the calibration-time "
+        "granularity controlled by stage-1's --kv-granularity.",
     )
+    # Stage-1 path keys (model_path / output_dir) are accepted as fallbacks
+    # so that one unified YAML can drive both stages.
+    parser.add_argument("--model-path", type=str, default="", help=argparse.SUPPRESS)
+    parser.add_argument("--output-dir", type=str, default="", help=argparse.SUPPRESS)
     args = parser.parse_args()
 
     # Lazy-import _yaml_args (sibling module in tools/). Done here instead of
@@ -584,6 +593,22 @@ def process_moe_values(data: Dict[str, Dict]) -> Dict[str, Dict]:
 
     apply_yaml_config(parser, args)
 
+    # Path fallbacks: when running with the unified Hy3 YAML, stage 2 reuses
+    # stage 1's `model_path` as the bf16 input dir, and `output_dir` (where
+    # stage 1 wrote stats) as the activation-json dir.
+    if not getattr(args, "input_bf16_hf_path", "") and getattr(args, "model_path", ""):
+        args.input_bf16_hf_path = args.model_path
+        print(
+            f"[yaml-config] input_bf16_hf_path not set; falling back to "
+            f"model_path={args.input_bf16_hf_path!r}"
+        )
+    if not getattr(args, "input_vllm_ac_json_path", "") and getattr(args, "output_dir", ""):
+        args.input_vllm_ac_json_path = args.output_dir
+        print(
+            f"[yaml-config] input_vllm_ac_json_path not set; falling back to "
+            f"output_dir={args.input_vllm_ac_json_path!r}"
+        )
+
     # Validate required paths (may come from CLI or YAML).
     missing = [
         name