From ce90a993b7ee8b2646517e4068f22c207b5f7bed Mon Sep 17 00:00:00 2001 From: krizaltang Date: Wed, 27 May 2026 23:10:27 +0800 Subject: [PATCH 1/3] [fix]:uniform naming convention for hy3. --- configs/hy3/ptq/hy3_kvcache_calibrate.yaml | 2 +- configs/hy3/ptq/hy3_vllm_calibrate.yaml | 2 +- .../ptq/hy3_vllm_quant_fp8_per_tensor.yaml | 12 +- scripts/ptq/README.md | 38 +++-- ...Y3.sh => run_kvcache_calibrate_for_hy3.sh} | 4 +- ...r_HY3.sh => run_vllm_calibrate_for_hy3.sh} | 4 +- ...t_for_HY3.sh => run_vllm_quant_for_hy3.sh} | 14 +- tools/_yaml_args.py | 2 +- tools/fp8_quant_with_vllm_activation.py | 154 +++++++++++++++++- tools/kvcache/README.md | 4 +- 10 files changed, 192 insertions(+), 44 deletions(-) rename scripts/ptq/{run_kvcache_calibrate_for_HY3.sh => run_kvcache_calibrate_for_hy3.sh} (87%) rename scripts/ptq/{run_vllm_calibrate_for_HY3.sh => run_vllm_calibrate_for_hy3.sh} (89%) rename scripts/ptq/{run_vllm_quant_for_HY3.sh => run_vllm_quant_for_hy3.sh} (90%) diff --git a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml b/configs/hy3/ptq/hy3_kvcache_calibrate.yaml index a22f3538..def75081 100644 --- a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml +++ b/configs/hy3/ptq/hy3_kvcache_calibrate.yaml @@ -1,4 +1,4 @@ -# KV-cache calibration + scale search for HY3 (standalone, no weight/MoE hooks) +# KV-cache calibration + scale search for hy3 (standalone, no weight/MoE hooks) # Consumed by: tools/kvcache/run_kvcache_calibrate.py # # Keys here match the script's argparse `dest` names. Values listed below diff --git a/configs/hy3/ptq/hy3_vllm_calibrate.yaml b/configs/hy3/ptq/hy3_vllm_calibrate.yaml index 6350616e..994ff1da 100644 --- a/configs/hy3/ptq/hy3_vllm_calibrate.yaml +++ b/configs/hy3/ptq/hy3_vllm_calibrate.yaml @@ -1,4 +1,4 @@ -# vLLM activation calibration for HY3 (collects activation + MoE expert stats, +# vLLM activation calibration for hy3 (collects activation + MoE expert stats, # and optionally KV-cache stats / scale search). 
# Consumed by: tools/run_vllm_calibrate.py # diff --git a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml b/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml index 2a9eddfd..8584688c 100644 --- a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml +++ b/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml @@ -1,4 +1,4 @@ -# Stage-2 of the HY3 PTQ pipeline: FP8 quantization of bf16 weights using +# Stage-2 of the hy3 PTQ pipeline: FP8 quantization of bf16 weights using # the activation_stats / moe_expert_stats produced by stage-1 calibration. # Consumed by: tools/fp8_quant_with_vllm_activation.py # @@ -7,7 +7,7 @@ # precedence (e.g. `python3 ... -c this.yaml --num-workers 8` will use 8). # # IMPORTANT: input_vllm_ac_json_path MUST equal the `output_dir` used in -# stage 1 (HY3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not +# stage 1 (hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not # find activation_stats.json / moe_expert_stats.json. input_bf16_hf_path: /path/to/input/model @@ -17,3 +17,11 @@ output_fp8_hf_path: /path/to/output/fp8_model # Optional: leave at defaults unless you know what you are doing block_size: [-1, -1] num_workers: 16 + +# KV-cache scheme & granularity (must match calibration config) +# scheme: dynamic => no static scale is saved; granularity forced to per_token_per_head +# static => granularity can be none | per-tensor | per-head +k_scheme: dynamic # dynamic | static +v_scheme: static # dynamic | static +k_granularity: per-head # none | per-tensor | per-head (only used when k_scheme=static) +v_granularity: per-head # none | per-tensor | per-head (only used when v_scheme=static) diff --git a/scripts/ptq/README.md b/scripts/ptq/README.md index 89998f2c..1eb3ae66 100644 --- a/scripts/ptq/README.md +++ b/scripts/ptq/README.md @@ -8,7 +8,7 @@ ## 一、环境准备(运行校准脚本前必须完成) -> 📌 **硬性要求**(当前 HY3 校准脚本经过验证的配置): +> 📌 **硬性要求**(当前 hy3 校准脚本经过验证的配置): > - **算力**:**16 卡**(两个节点 × 每节点 8 卡),用于 TP/PP 跨节点切分 > - **vLLM 
版本**:**v0.20.0**(补丁文件按此版本对齐,其它版本需要重新生成补丁) > - **Python 环境**:所有节点保持一致(建议使用同一个 conda / venv) @@ -21,7 +21,7 @@ ### 1. 准备 Ray 集群(2 节点 × 8 卡 = 16 卡) -HY3 等大模型需要跨节点 TP/PP,校准脚本默认走 vLLM 的 Ray distributed executor,必须先在 **两台 8 卡节点** 上分别拉起 Ray,组成一个 16 卡集群。 +hy3 等大模型需要跨节点 TP/PP,校准脚本默认走 vLLM 的 Ray distributed executor,必须先在 **两台 8 卡节点** 上分别拉起 Ray,组成一个 16 卡集群。 下面给出的环境变量按 **RDMA / 多网卡** 集群的常见配置示例,请按实际网络拓扑调整(特别是 `*_SOCKET_IFNAME`、`NCCL_IB_GID_INDEX`)。 @@ -113,19 +113,19 @@ bash tools/vllm_patch/install.sh --help # 查看完整用法 --- -## 二、HY3.0 系列脚本(Hunyuan-A20B 等 HY3 模型) +## 二、hy3.0 系列脚本(Hunyuan-A20B 等 hy3 模型) 下面 3 个脚本共享同一套 vLLM 运行时环境(chunked prefill / FlashInfer attention / mp distributed executor / fused MoE 等),区别在于产出物不同。 | 脚本 | 用途 | 入口 | | --- | --- | --- | -| [`run_vllm_quant_for_HY3.sh`](./run_vllm_quant_for_HY3.sh) | ★ 推荐的"一键流水线":校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` | -| [`run_vllm_calibrate_for_HY3.sh`](./run_vllm_calibrate_for_HY3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` | -| [`run_kvcache_calibrate_for_HY3.sh`](./run_kvcache_calibrate_for_HY3.sh) | 仅 KV-cache 校准(轻量) | `tools/kvcache/run_kvcache_calibrate.py` | +| [`run_vllm_quant_for_hy3.sh`](./run_vllm_quant_for_hy3.sh) | ★ 推荐的"一键流水线":校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` | +| [`run_vllm_calibrate_for_hy3.sh`](./run_vllm_calibrate_for_hy3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` | +| [`run_kvcache_calibrate_for_hy3.sh`](./run_kvcache_calibrate_for_hy3.sh) | 仅 KV-cache 校准(轻量) | `tools/kvcache/run_kvcache_calibrate.py` | --- -### 1. `run_vllm_quant_for_HY3.sh` ★推荐的"一键流水线" +### 1. 
`run_vllm_quant_for_hy3.sh` ★推荐的"一键流水线" **功能**:bf16 模型 → vLLM 激活校准 → FP8 HF safetensors,全流程一次完成。 @@ -141,22 +141,25 @@ bash tools/vllm_patch/install.sh --help # 查看完整用法 #### 阶段 2:调用 `tools/fp8_quant_with_vllm_activation.py` - 读取 `${stats_dir}` 下的 `activation_stats.json` / `moe_expert_stats.json`,结合原 bf16 权重,做 per-tensor FP8 量化(含 weight + input scale),写出到 `${fp8_path}`。 -- 当存在 per-head KV 统计时,会同时输出 `kv_cache_scales.safetensors`。 +- KV-cache scale 的写入行为由量化 YAML 中的 `k_scheme` / `v_scheme` 控制: + - `static`:将校准得到的 scale 写入 `kv_cache_scales.safetensors`,粒度由 `k_granularity` / `v_granularity` 决定(`none` | `per-tensor` | `per-head`)。 + - `dynamic`:不写入对应的 scale(`model.safetensors.index.json` 中也不包含对应 key),`config.json` 中标记为 `"scheme": "dynamic", "granularity": "per_token_per_head"`(与 `q_quant` 一致)。 +- 产出的 `config.json` 中 `attn_quant_config.kv_cache_quant` 的 `k_quant` 和 `v_quant` 独立配置,支持 K/V 使用不同的 scheme。 #### CLI 开关 ```bash -bash run_vllm_quant_for_HY3.sh # 两阶段都跑 -bash run_vllm_quant_for_HY3.sh --skip-calibrate # 仅量化(复用已有 stats_dir) -bash run_vllm_quant_for_HY3.sh --skip-quantize # 仅校准 -bash run_vllm_quant_for_HY3.sh --help # 打印用法 +bash run_vllm_quant_for_hy3.sh # 两阶段都跑 +bash run_vllm_quant_for_hy3.sh --skip-calibrate # 仅量化(复用已有 stats_dir) +bash run_vllm_quant_for_hy3.sh --skip-quantize # 仅校准 +bash run_vllm_quant_for_hy3.sh --help # 打印用法 ``` > 脚本开启 `set -euo pipefail`,任一阶段失败将立即中断。 --- -### 2. `run_vllm_calibrate_for_HY3.sh` — 一键脚本里的"阶段 1"独立版 +### 2. 
`run_vllm_calibrate_for_hy3.sh` — 一键脚本里的"阶段 1"独立版 **功能**:只跑 W8A8C8 联合校准,不做量化。 @@ -168,6 +171,7 @@ bash run_vllm_quant_for_HY3.sh --help # 打印用法 VLLM_MOE_COLLECT_STATS_VERBOSE=0 ``` - **默认配置**:`--kv-granularity per-head`,并开启 `--search-kv-scale`。 +- **注意**:校准阶段无论后续 scheme 是 dynamic 还是 static,都会正常收集 KV 统计数据。scheme 的判断仅在阶段 2(量化)时生效。 - **产物**(写入 `${output_dir}`): - `activation_stats.json` - `moe_expert_stats.json` @@ -177,20 +181,20 @@ bash run_vllm_quant_for_HY3.sh --help # 打印用法 #### 适用场景 - 想自己接后续量化工具,不走 `fp8_quant_with_vllm_activation.py`。 -- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`,再用 `run_vllm_quant_for_HY3.sh --skip-calibrate` 复用结果。 +- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`,再用 `run_vllm_quant_for_hy3.sh --skip-calibrate` 复用结果。 - Debug 用 `--skip-weight-loading` 跑 dummy 权重,快速验证 hook 注册流程。 --- -### 3. `run_kvcache_calibrate_for_HY3.sh` — 仅校准 KV-cache(轻量) +### 3. `run_kvcache_calibrate_for_hy3.sh` — 仅校准 KV-cache(轻量) **功能**:只校准 KV-cache(K/V min/max),不做 weight / activation / MoE 统计。 - **入口**:`tools/kvcache/run_kvcache_calibrate.py` -#### 关键差异(与 `run_vllm_calibrate_for_HY3.sh` 对比) +#### 关键差异(与 `run_vllm_calibrate_for_hy3.sh` 对比) -| 维度 | `run_kvcache_calibrate_for_HY3.sh` | `run_vllm_calibrate_for_HY3.sh` | +| 维度 | `run_kvcache_calibrate_for_hy3.sh` | `run_vllm_calibrate_for_hy3.sh` | | --- | --- | --- | | MoE / Linear 钩子 | 故意 **NOT** 设置 `VLLM_MOE_COLLECT_STATS`,完全跳过,启动更快、CPU 内存占用更低 | 全开 | | KV 搜索范围 | `[0.4, 8.0]`,`num_steps=50`(更窄、更聚焦) | `[0.8, 16.0]` | diff --git a/scripts/ptq/run_kvcache_calibrate_for_HY3.sh b/scripts/ptq/run_kvcache_calibrate_for_hy3.sh similarity index 87% rename from scripts/ptq/run_kvcache_calibrate_for_HY3.sh rename to scripts/ptq/run_kvcache_calibrate_for_hy3.sh index b0777dc3..ba519324 100755 --- a/scripts/ptq/run_kvcache_calibrate_for_HY3.sh +++ b/scripts/ptq/run_kvcache_calibrate_for_hy3.sh @@ -17,10 +17,10 @@ export ASYNC_SCHEDULING=1 export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF 
-CONFIG=configs/HY3/ptq/HY3_kvcache_calibrate.yaml +CONFIG=configs/hy3/ptq/hy3_kvcache_calibrate.yaml mkdir -p logs python3 tools/kvcache/run_kvcache_calibrate.py \ -c $CONFIG \ - 2>&1 | tee logs/run_kvcache_calibrate_HY3.log + 2>&1 | tee logs/run_kvcache_calibrate_hy3.log diff --git a/scripts/ptq/run_vllm_calibrate_for_HY3.sh b/scripts/ptq/run_vllm_calibrate_for_hy3.sh similarity index 89% rename from scripts/ptq/run_vllm_calibrate_for_HY3.sh rename to scripts/ptq/run_vllm_calibrate_for_hy3.sh index 18a1566d..a28c97c4 100755 --- a/scripts/ptq/run_vllm_calibrate_for_HY3.sh +++ b/scripts/ptq/run_vllm_calibrate_for_hy3.sh @@ -22,10 +22,10 @@ export ASYNC_SCHEDULING=1 export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF -CONFIG=configs/HY3/ptq/HY3_vllm_calibrate.yaml +CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml mkdir -p logs python3 tools/run_vllm_calibrate.py \ -c $CONFIG \ - 2>&1 | tee logs/run_vllm_calibrate_HY3.log + 2>&1 | tee logs/run_vllm_calibrate_hy3.log diff --git a/scripts/ptq/run_vllm_quant_for_HY3.sh b/scripts/ptq/run_vllm_quant_for_hy3.sh similarity index 90% rename from scripts/ptq/run_vllm_quant_for_HY3.sh rename to scripts/ptq/run_vllm_quant_for_hy3.sh index 092bbabd..528718cd 100755 --- a/scripts/ptq/run_vllm_quant_for_HY3.sh +++ b/scripts/ptq/run_vllm_quant_for_hy3.sh @@ -18,13 +18,13 @@ # ``output_dir`` in CALIB_CONFIG, otherwise stage 2 cannot find the stats. 
# # Usage: -# bash run_vllm_quant_for_HY3.sh +# bash run_vllm_quant_for_hy3.sh # (run both stages back-to-back) # -# bash run_vllm_quant_for_HY3.sh --skip-calibrate +# bash run_vllm_quant_for_hy3.sh --skip-calibrate # (skip stage 1, only quantize using existing stats dir) # -# bash run_vllm_quant_for_HY3.sh --skip-quantize +# bash run_vllm_quant_for_hy3.sh --skip-quantize # (only run stage 1, do not produce the FP8 model) # ============================================================================= @@ -68,8 +68,8 @@ export PRECISIONMODE=HF # ---------------------------------------------------------------------------- # YAML configs (one per stage) # ---------------------------------------------------------------------------- -CALIB_CONFIG=configs/HY3/ptq/HY3_vllm_calibrate.yaml -QUANT_CONFIG=configs/HY3/ptq/HY3_vllm_quant_fp8.yaml +CALIB_CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml +QUANT_CONFIG=configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml mkdir -p logs @@ -82,7 +82,7 @@ if [[ "${do_calibrate}" -eq 1 ]]; then python3 tools/run_vllm_calibrate.py \ -c "${CALIB_CONFIG}" \ - 2>&1 | tee "logs/run_vllm_quant_HY3-calibrate.log" + 2>&1 | tee "logs/run_vllm_quant_hy3-calibrate.log" echo "[pipeline] Stage 1 finished." else @@ -98,7 +98,7 @@ if [[ "${do_quantize}" -eq 1 ]]; then python3 tools/fp8_quant_with_vllm_activation.py \ -c "${QUANT_CONFIG}" \ - 2>&1 | tee "logs/run_vllm_quant_HY3-quantize.log" + 2>&1 | tee "logs/run_vllm_quant_hy3-quantize.log" echo "[pipeline] Stage 2 finished." else diff --git a/tools/_yaml_args.py b/tools/_yaml_args.py index 6a754c01..cead2860 100644 --- a/tools/_yaml_args.py +++ b/tools/_yaml_args.py @@ -14,7 +14,7 @@ """Shared YAML-config loader for standalone tool scripts. -The HY3 tool entry points (``tools/run_vllm_calibrate.py``, +The hy3 tool entry points (``tools/run_vllm_calibrate.py``, ``tools/kvcache/run_kvcache_calibrate.py`` and ``tools/fp8_quant_with_vllm_activation.py``) all use ``argparse``. 
To match the style of ``scripts/ptq/run_vllm_quant_for_deepseek_v3.sh`` (shell only diff --git a/tools/fp8_quant_with_vllm_activation.py b/tools/fp8_quant_with_vllm_activation.py index 6b1d35d8..8966c7d9 100644 --- a/tools/fp8_quant_with_vllm_activation.py +++ b/tools/fp8_quant_with_vllm_activation.py @@ -247,7 +247,50 @@ def main(bf16_path, fp8_path, block_size, ac_json_data): config["quantization_config"]["weight_block_size"] = block_size kv_state_dict = {} - kv_granularity = "" + k_kv_granularity = "" # resolved granularity for k-cache + v_kv_granularity = "" # resolved granularity for v-cache + + # Resolve scheme & granularity from CLI/YAML args + k_scheme = getattr(args, "k_scheme", "static") + v_scheme = getattr(args, "v_scheme", "static") + k_granularity_cfg = getattr(args, "k_granularity", "per-head").replace("-", "_") + v_granularity_cfg = getattr(args, "v_granularity", "per-head").replace("-", "_") + + # If scheme is dynamic, granularity is forced to per_token_per_head + if k_scheme == "dynamic": + k_kv_granularity = "per_token_per_head" + if v_scheme == "dynamic": + v_kv_granularity = "per_token_per_head" + + print(f"[KV-config] k_scheme={k_scheme}, v_scheme={v_scheme}") + print( + f"[KV-config] k_granularity_cfg={k_granularity_cfg}, v_granularity_cfg={v_granularity_cfg}" + ) + + # ---- Load per-tensor tuned KV scales (from --search-kv-scale stage) ---- + # The stage-1 search outputs ``kv_cache_tuned_scales.json`` whose keys + # already match the safetensor key naming (e.g. + # ``model.layers.X.self_attn.k_cache.scale``). If the file exists, we + # prefer its values for the per-tensor branch instead of falling back + # to a recomputed base scale (and certainly not the legacy 1.0). 
+ tuned_kv_scales = {} + tuned_scales_path = os.path.join(args.input_vllm_ac_json_path, "kv_cache_tuned_scales.json") + if os.path.isfile(tuned_scales_path): + try: + with open(tuned_scales_path, "r", encoding="utf8") as _tsf: + tuned_kv_scales = json.load(_tsf) + print( + f"[KV-scale] Loaded {len(tuned_kv_scales)} tuned per-tensor KV scales " + f"from {tuned_scales_path}" + ) + except Exception as _e: + print(f"[WARN] failed to load {tuned_scales_path}: {_e}") + tuned_kv_scales = {} + else: + print( + f"[KV-scale] {tuned_scales_path} not found; " + f"will fall back to min/max-based per-tensor scale." + ) # Auto-detect kv_head_repeat from the model's real num_key_value_heads # vs the per-head stats vector length collected by AngelSlim. @@ -271,6 +314,18 @@ def main(bf16_path, fp8_path, block_size, ac_json_data): for scale_name, stats in ac_json_data.items(): if "cache" not in scale_name: continue + + # Determine whether this entry is for k-cache or v-cache + is_k_cache = "k_cache" in scale_name + is_v_cache = "v_cache" in scale_name + # Skip writing scale if the corresponding scheme is dynamic + if is_k_cache and k_scheme == "dynamic": + print(f"[KV-scale] SKIP (k_scheme=dynamic): {scale_name}") + continue + if is_v_cache and v_scheme == "dynamic": + print(f"[KV-scale] SKIP (v_scheme=dynamic): {scale_name}") + continue + act_save_name = f"{scale_name.replace('attn.attn', 'attn')}.scale" min_val = stats["min"] max_val = stats["max"] @@ -298,25 +353,75 @@ def main(bf16_path, fp8_path, block_size, ac_json_data): ) per_head_scales = per_head_scales[::replication] tensor_input_scale = torch.tensor(per_head_scales, dtype=torch.float32) - kv_granularity = "per_head" + detected_granularity = "per_head" else: # per-tensor: single scalar scale - # input_scale = max(abs(min_val), abs(max_val)) / fp8_max - # tensor_input_scale = torch.tensor([input_scale], dtype=torch.float32) - tensor_input_scale = torch.tensor([1.0]) - kv_granularity = "per_tensor" - print(f"{scale_name} 
granularity={kv_granularity} scale={tensor_input_scale}") + # + # Preference order: + # 1) Use tuned scale from kv_cache_tuned_scales.json if available + # (the search-kv-scale stage already wrote one entry per + # k_cache / v_cache layer using exactly the same key as + # ``act_save_name`` below). + # 2) Otherwise, compute base scale from min/max as + # max(|min|, |max|) / fp8_max (this is the unsearched + # baseline scale; previously this branch was hardcoded + # to 1.0 which is wrong). + act_save_name_lookup = f"{scale_name.replace('attn.attn', 'attn')}.scale" + if act_save_name_lookup in tuned_kv_scales: + scalar_scale = float(tuned_kv_scales[act_save_name_lookup]) + tensor_input_scale = torch.tensor([scalar_scale], dtype=torch.float32) + scale_source = "tuned" + else: + base_scale = max(abs(min_val), abs(max_val)) / fp8_max + tensor_input_scale = torch.tensor([base_scale], dtype=torch.float32) + scale_source = "min_max" + detected_granularity = "per_tensor" + + # Update resolved granularity based on actual data + if is_k_cache and not k_kv_granularity: + k_kv_granularity = detected_granularity + if is_v_cache and not v_kv_granularity: + v_kv_granularity = detected_granularity + + scale_source_tag = locals().get("scale_source", "per_head") + print( + f"{scale_name} granularity={detected_granularity} " + f"src={scale_source_tag} scale={tensor_input_scale}" + ) kv_state_dict[act_save_name] = tensor_input_scale index[act_save_name] = "kv_cache_scales.safetensors" + + # Use config-specified granularity if scheme is static and we didn't detect from data + if k_scheme == "static" and not k_kv_granularity: + k_kv_granularity = k_granularity_cfg + if v_scheme == "static" and not v_kv_granularity: + v_kv_granularity = v_granularity_cfg + + # Write kv_cache_scales.safetensors only if there are static scales to save if len(kv_state_dict) > 0: kv_safetensor_file = os.path.join(fp8_path, "kv_cache_scales.safetensors") save_file(kv_state_dict, kv_safetensor_file) 
config["quantization_config"]["kv_cache_scheme"] = "static" + + # Build attn_quant_config: k_quant and v_quant depend on their respective schemes + k_quant_config = ( + {"dtype": "fp8_e4m3", "scheme": "dynamic", "granularity": "per_token_per_head"} + if k_scheme == "dynamic" + else {"dtype": "fp8_e4m3", "scheme": "static", "granularity": k_kv_granularity} + ) + v_quant_config = ( + {"dtype": "fp8_e4m3", "scheme": "dynamic", "granularity": "per_token_per_head"} + if v_scheme == "dynamic" + else {"dtype": "fp8_e4m3", "scheme": "static", "granularity": v_kv_granularity} + ) + + # Only emit attn_quant_config if at least one of k/v has meaningful config + if len(kv_state_dict) > 0 or k_scheme == "dynamic" or v_scheme == "dynamic": config["attn_quant_config"] = { "kv_cache_quant": { "dtype": "fp8_e4m3", - "k_quant": {"scheme": "static", "granularity": kv_granularity}, - "v_quant": {"scheme": "static", "granularity": kv_granularity}, + "k_quant": k_quant_config, + "v_quant": v_quant_config, }, "q_quant": { "dtype": "fp8_e4m3", @@ -434,6 +539,37 @@ def process_moe_values(data: Dict[str, Dict]) -> Dict[str, Dict]: type=str, default="", ) + # KV-cache scheme & granularity + parser.add_argument( + "--k-scheme", + type=str, + default="static", + choices=["dynamic", "static"], + help="K-cache quantization scheme: 'dynamic' (no static scale saved, " + "granularity forced to per_token_per_head) or 'static' (use calibrated scale).", + ) + parser.add_argument( + "--v-scheme", + type=str, + default="static", + choices=["dynamic", "static"], + help="V-cache quantization scheme: 'dynamic' (no static scale saved, " + "granularity forced to per_token_per_head) or 'static' (use calibrated scale).", + ) + parser.add_argument( + "--k-granularity", + type=str, + default="per-head", + choices=["none", "per-tensor", "per-head"], + help="K-cache granularity when k_scheme=static (ignored if k_scheme=dynamic).", + ) + parser.add_argument( + "--v-granularity", + type=str, + default="per-head", + 
choices=["none", "per-tensor", "per-head"], + help="V-cache granularity when v_scheme=static (ignored if v_scheme=dynamic).", + ) args = parser.parse_args() # Lazy-import _yaml_args (sibling module in tools/). Done here instead of diff --git a/tools/kvcache/README.md b/tools/kvcache/README.md index c24c5965..1313bb78 100644 --- a/tools/kvcache/README.md +++ b/tools/kvcache/README.md @@ -60,8 +60,8 @@ python3 tools/kvcache/run_kvcache_calibrate.py \ --search-kv-num-steps 50 ``` -A ready-to-run wrapper for HY3 lives at -[`scripts/ptq/run_kvcache_calibrate_for_HY3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_HY3.sh). +A ready-to-run wrapper for hy3 lives at +[`scripts/ptq/run_kvcache_calibrate_for_hy3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_hy3.sh). > ⚠️ Requires the AngelSlim vLLM patch > ([`tools/vllm_patch/`](../vllm_patch/)) to be installed in the active From f7612a67dbcdf72bf27f8fdbd0cfcbceb95331c4 Mon Sep 17 00:00:00 2001 From: krizaltang Date: Fri, 29 May 2026 11:52:55 +0800 Subject: [PATCH 2/3] renaming hy3 to Hy3. 
--- .../ptq/Hy3_kvcache_calibrate.yaml} | 12 +++---- .../ptq/Hy3_vllm_calibrate.yaml} | 8 ++--- .../ptq/Hy3_vllm_quant_fp8_per_tensor.yaml} | 10 +++--- scripts/ptq/README.md | 32 +++++++++---------- ...y3.sh => run_kvcache_calibrate_for_Hy3.sh} | 4 +-- ...r_hy3.sh => run_vllm_calibrate_for_Hy3.sh} | 4 +-- ...t_for_hy3.sh => run_vllm_quant_for_Hy3.sh} | 14 ++++---- tools/_yaml_args.py | 2 +- tools/kvcache/README.md | 4 +-- 9 files changed, 45 insertions(+), 45 deletions(-) rename configs/{hy3/ptq/hy3_kvcache_calibrate.yaml => Hy3/ptq/Hy3_kvcache_calibrate.yaml} (76%) rename configs/{hy3/ptq/hy3_vllm_calibrate.yaml => Hy3/ptq/Hy3_vllm_calibrate.yaml} (85%) rename configs/{hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml => Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml} (77%) rename scripts/ptq/{run_kvcache_calibrate_for_hy3.sh => run_kvcache_calibrate_for_Hy3.sh} (87%) rename scripts/ptq/{run_vllm_calibrate_for_hy3.sh => run_vllm_calibrate_for_Hy3.sh} (89%) rename scripts/ptq/{run_vllm_quant_for_hy3.sh => run_vllm_quant_for_Hy3.sh} (90%) diff --git a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml b/configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml similarity index 76% rename from configs/hy3/ptq/hy3_kvcache_calibrate.yaml rename to configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml index def75081..1586e981 100644 --- a/configs/hy3/ptq/hy3_kvcache_calibrate.yaml +++ b/configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml @@ -1,4 +1,4 @@ -# KV-cache calibration + scale search for hy3 (standalone, no weight/MoE hooks) +# KV-cache calibration + scale search for Hy3 (standalone, no weight/MoE hooks) # Consumed by: tools/kvcache/run_kvcache_calibrate.py # # Keys here match the script's argparse `dest` names. 
Values listed below @@ -19,11 +19,11 @@ distributed_executor_backend: ray # ray | mp skip_weight_loading: false # true => dummy weights (debug only) # -------- KV-cache granularity -------- -per_head: true # true => per-head scales; false => per-tensor +per_head: false # true => per-head scales; false => per-tensor # -------- KV-cache scale search -------- -search_kv_scale: true -search_kv_num_samples: 32 -search_kv_min_multiplier: 0.4 -search_kv_max_multiplier: 8.0 +search_kv_scale: false +search_kv_num_samples: 64 +search_kv_min_multiplier: 0.8 +search_kv_max_multiplier: 16.0 search_kv_num_steps: 50 diff --git a/configs/hy3/ptq/hy3_vllm_calibrate.yaml b/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml similarity index 85% rename from configs/hy3/ptq/hy3_vllm_calibrate.yaml rename to configs/Hy3/ptq/Hy3_vllm_calibrate.yaml index 994ff1da..3a29b001 100644 --- a/configs/hy3/ptq/hy3_vllm_calibrate.yaml +++ b/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml @@ -1,4 +1,4 @@ -# vLLM activation calibration for hy3 (collects activation + MoE expert stats, +# vLLM activation calibration for Hy3 (collects activation + MoE expert stats, # and optionally KV-cache stats / scale search). 
# Consumed by: tools/run_vllm_calibrate.py # @@ -14,7 +14,7 @@ output_dir: /path/to/statistics # -------- Model loading / runtime -------- tp_size: 16 batch_size: 4 -num_samples: 512 +num_samples: 64 max_length: 16384 distributed_executor_backend: ray # ray | mp skip_weight_loading: false # true => dummy weights (debug only) @@ -27,10 +27,10 @@ num_speculative_tokens: 1 verbose: false # -------- KV-cache granularity -------- -kv_granularity: per-head # none | per-tensor | per-head +kv_granularity: per-tensor # none | per-tensor | per-head # -------- KV-cache scale search -------- -search_kv_scale: true +search_kv_scale: false search_kv_num_samples: 64 search_kv_min_multiplier: 0.8 search_kv_max_multiplier: 16.0 diff --git a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml b/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml similarity index 77% rename from configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml rename to configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml index 8584688c..6f145a3e 100644 --- a/configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml +++ b/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml @@ -1,4 +1,4 @@ -# Stage-2 of the hy3 PTQ pipeline: FP8 quantization of bf16 weights using +# Stage-2 of the Hy3 PTQ pipeline: FP8 quantization of bf16 weights using # the activation_stats / moe_expert_stats produced by stage-1 calibration. # Consumed by: tools/fp8_quant_with_vllm_activation.py # @@ -7,7 +7,7 @@ # precedence (e.g. `python3 ... -c this.yaml --num-workers 8` will use 8). # # IMPORTANT: input_vllm_ac_json_path MUST equal the `output_dir` used in -# stage 1 (hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not +# stage 1 (Hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not # find activation_stats.json / moe_expert_stats.json. 
input_bf16_hf_path: /path/to/input/model @@ -21,7 +21,7 @@ num_workers: 16 # KV-cache scheme & granularity (must match calibration config) # scheme: dynamic => no static scale is saved; granularity forced to per_token_per_head # static => granularity can be none | per-tensor | per-head -k_scheme: dynamic # dynamic | static +k_scheme: static # dynamic | static v_scheme: static # dynamic | static -k_granularity: per-head # none | per-tensor | per-head (only used when k_scheme=static) -v_granularity: per-head # none | per-tensor | per-head (only used when v_scheme=static) +k_granularity: per-tensor # none | per-tensor | per-head (only used when k_scheme=static) +v_granularity: per-tensor # none | per-tensor | per-head (only used when v_scheme=static) diff --git a/scripts/ptq/README.md b/scripts/ptq/README.md index 1eb3ae66..7931fe5a 100644 --- a/scripts/ptq/README.md +++ b/scripts/ptq/README.md @@ -8,7 +8,7 @@ ## 一、环境准备(运行校准脚本前必须完成) -> 📌 **硬性要求**(当前 hy3 校准脚本经过验证的配置): +> 📌 **硬性要求**(当前 Hy3 校准脚本经过验证的配置): > - **算力**:**16 卡**(两个节点 × 每节点 8 卡),用于 TP/PP 跨节点切分 > - **vLLM 版本**:**v0.20.0**(补丁文件按此版本对齐,其它版本需要重新生成补丁) > - **Python 环境**:所有节点保持一致(建议使用同一个 conda / venv) @@ -21,7 +21,7 @@ ### 1. 
准备 Ray 集群(2 节点 × 8 卡 = 16 卡) -hy3 等大模型需要跨节点 TP/PP,校准脚本默认走 vLLM 的 Ray distributed executor,必须先在 **两台 8 卡节点** 上分别拉起 Ray,组成一个 16 卡集群。 +Hy3 等大模型需要跨节点 TP/PP,校准脚本默认走 vLLM 的 Ray distributed executor,必须先在 **两台 8 卡节点** 上分别拉起 Ray,组成一个 16 卡集群。 下面给出的环境变量按 **RDMA / 多网卡** 集群的常见配置示例,请按实际网络拓扑调整(特别是 `*_SOCKET_IFNAME`、`NCCL_IB_GID_INDEX`)。 @@ -113,19 +113,19 @@ bash tools/vllm_patch/install.sh --help # 查看完整用法 --- -## 二、hy3.0 系列脚本(Hunyuan-A20B 等 hy3 模型) +## 二、Hy3.0 系列脚本(Hunyuan-A20B 等 Hy3 模型) 下面 3 个脚本共享同一套 vLLM 运行时环境(chunked prefill / FlashInfer attention / mp distributed executor / fused MoE 等),区别在于产出物不同。 | 脚本 | 用途 | 入口 | | --- | --- | --- | -| [`run_vllm_quant_for_hy3.sh`](./run_vllm_quant_for_hy3.sh) | ★ 推荐的"一键流水线":校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` | -| [`run_vllm_calibrate_for_hy3.sh`](./run_vllm_calibrate_for_hy3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` | -| [`run_kvcache_calibrate_for_hy3.sh`](./run_kvcache_calibrate_for_hy3.sh) | 仅 KV-cache 校准(轻量) | `tools/kvcache/run_kvcache_calibrate.py` | +| [`run_vllm_quant_for_Hy3.sh`](./run_vllm_quant_for_Hy3.sh) | ★ 推荐的"一键流水线":校准 + 量化 | `tools/run_vllm_calibrate.py` + `tools/fp8_quant_with_vllm_activation.py` | +| [`run_vllm_calibrate_for_Hy3.sh`](./run_vllm_calibrate_for_Hy3.sh) | 仅 W8A8C8 联合校准 | `tools/run_vllm_calibrate.py` | +| [`run_kvcache_calibrate_for_Hy3.sh`](./run_kvcache_calibrate_for_Hy3.sh) | 仅 KV-cache 校准(轻量) | `tools/kvcache/run_kvcache_calibrate.py` | --- -### 1. `run_vllm_quant_for_hy3.sh` ★推荐的"一键流水线" +### 1. 
`run_vllm_quant_for_Hy3.sh` ★推荐的"一键流水线" **功能**:bf16 模型 → vLLM 激活校准 → FP8 HF safetensors,全流程一次完成。 @@ -149,17 +149,17 @@ bash tools/vllm_patch/install.sh --help # 查看完整用法 #### CLI 开关 ```bash -bash run_vllm_quant_for_hy3.sh # 两阶段都跑 -bash run_vllm_quant_for_hy3.sh --skip-calibrate # 仅量化(复用已有 stats_dir) -bash run_vllm_quant_for_hy3.sh --skip-quantize # 仅校准 -bash run_vllm_quant_for_hy3.sh --help # 打印用法 +bash run_vllm_quant_for_Hy3.sh # 两阶段都跑 +bash run_vllm_quant_for_Hy3.sh --skip-calibrate # 仅量化(复用已有 stats_dir) +bash run_vllm_quant_for_Hy3.sh --skip-quantize # 仅校准 +bash run_vllm_quant_for_Hy3.sh --help # 打印用法 ``` > 脚本开启 `set -euo pipefail`,任一阶段失败将立即中断。 --- -### 2. `run_vllm_calibrate_for_hy3.sh` — 一键脚本里的"阶段 1"独立版 +### 2. `run_vllm_calibrate_for_Hy3.sh` — 一键脚本里的"阶段 1"独立版 **功能**:只跑 W8A8C8 联合校准,不做量化。 @@ -181,20 +181,20 @@ bash run_vllm_quant_for_hy3.sh --help # 打印用法 #### 适用场景 - 想自己接后续量化工具,不走 `fp8_quant_with_vllm_activation.py`。 -- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`,再用 `run_vllm_quant_for_hy3.sh --skip-calibrate` 复用结果。 +- 想单独调校 PTQ 数据集 / `num_samples` / `max_length`,再用 `run_vllm_quant_for_Hy3.sh --skip-calibrate` 复用结果。 - Debug 用 `--skip-weight-loading` 跑 dummy 权重,快速验证 hook 注册流程。 --- -### 3. `run_kvcache_calibrate_for_hy3.sh` — 仅校准 KV-cache(轻量) +### 3. 
`run_kvcache_calibrate_for_Hy3.sh` — 仅校准 KV-cache(轻量) **功能**:只校准 KV-cache(K/V min/max),不做 weight / activation / MoE 统计。 - **入口**:`tools/kvcache/run_kvcache_calibrate.py` -#### 关键差异(与 `run_vllm_calibrate_for_hy3.sh` 对比) +#### 关键差异(与 `run_vllm_calibrate_for_Hy3.sh` 对比) -| 维度 | `run_kvcache_calibrate_for_hy3.sh` | `run_vllm_calibrate_for_hy3.sh` | +| 维度 | `run_kvcache_calibrate_for_Hy3.sh` | `run_vllm_calibrate_for_Hy3.sh` | | --- | --- | --- | | MoE / Linear 钩子 | 故意 **NOT** 设置 `VLLM_MOE_COLLECT_STATS`,完全跳过,启动更快、CPU 内存占用更低 | 全开 | | KV 搜索范围 | `[0.4, 8.0]`,`num_steps=50`(更窄、更聚焦) | `[0.8, 16.0]` | diff --git a/scripts/ptq/run_kvcache_calibrate_for_hy3.sh b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh similarity index 87% rename from scripts/ptq/run_kvcache_calibrate_for_hy3.sh rename to scripts/ptq/run_kvcache_calibrate_for_Hy3.sh index ba519324..eb3413cb 100755 --- a/scripts/ptq/run_kvcache_calibrate_for_hy3.sh +++ b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh @@ -17,10 +17,10 @@ export ASYNC_SCHEDULING=1 export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF -CONFIG=configs/hy3/ptq/hy3_kvcache_calibrate.yaml +CONFIG=configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml mkdir -p logs python3 tools/kvcache/run_kvcache_calibrate.py \ -c $CONFIG \ - 2>&1 | tee logs/run_kvcache_calibrate_hy3.log + 2>&1 | tee logs/run_kvcache_calibrate_Hy3.log diff --git a/scripts/ptq/run_vllm_calibrate_for_hy3.sh b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh similarity index 89% rename from scripts/ptq/run_vllm_calibrate_for_hy3.sh rename to scripts/ptq/run_vllm_calibrate_for_Hy3.sh index a28c97c4..c7c9c812 100755 --- a/scripts/ptq/run_vllm_calibrate_for_hy3.sh +++ b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh @@ -22,10 +22,10 @@ export ASYNC_SCHEDULING=1 export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF -CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml +CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml mkdir -p logs python3 tools/run_vllm_calibrate.py \ -c $CONFIG \ - 2>&1 | tee 
logs/run_vllm_calibrate_hy3.log + 2>&1 | tee logs/run_vllm_calibrate_Hy3.log diff --git a/scripts/ptq/run_vllm_quant_for_hy3.sh b/scripts/ptq/run_vllm_quant_for_Hy3.sh similarity index 90% rename from scripts/ptq/run_vllm_quant_for_hy3.sh rename to scripts/ptq/run_vllm_quant_for_Hy3.sh index 528718cd..e5f53df8 100755 --- a/scripts/ptq/run_vllm_quant_for_hy3.sh +++ b/scripts/ptq/run_vllm_quant_for_Hy3.sh @@ -18,13 +18,13 @@ # ``output_dir`` in CALIB_CONFIG, otherwise stage 2 cannot find the stats. # # Usage: -# bash run_vllm_quant_for_hy3.sh +# bash run_vllm_quant_for_Hy3.sh # (run both stages back-to-back) # -# bash run_vllm_quant_for_hy3.sh --skip-calibrate +# bash run_vllm_quant_for_Hy3.sh --skip-calibrate # (skip stage 1, only quantize using existing stats dir) # -# bash run_vllm_quant_for_hy3.sh --skip-quantize +# bash run_vllm_quant_for_Hy3.sh --skip-quantize # (only run stage 1, do not produce the FP8 model) # ============================================================================= @@ -68,8 +68,8 @@ export PRECISIONMODE=HF # ---------------------------------------------------------------------------- # YAML configs (one per stage) # ---------------------------------------------------------------------------- -CALIB_CONFIG=configs/hy3/ptq/hy3_vllm_calibrate.yaml -QUANT_CONFIG=configs/hy3/ptq/hy3_vllm_quant_fp8_per_tensor.yaml +CALIB_CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml +QUANT_CONFIG=configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml mkdir -p logs @@ -82,7 +82,7 @@ if [[ "${do_calibrate}" -eq 1 ]]; then python3 tools/run_vllm_calibrate.py \ -c "${CALIB_CONFIG}" \ - 2>&1 | tee "logs/run_vllm_quant_hy3-calibrate.log" + 2>&1 | tee "logs/run_vllm_quant_Hy3-calibrate.log" echo "[pipeline] Stage 1 finished." 
else @@ -98,7 +98,7 @@ if [[ "${do_quantize}" -eq 1 ]]; then python3 tools/fp8_quant_with_vllm_activation.py \ -c "${QUANT_CONFIG}" \ - 2>&1 | tee "logs/run_vllm_quant_hy3-quantize.log" + 2>&1 | tee "logs/run_vllm_quant_Hy3-quantize.log" echo "[pipeline] Stage 2 finished." else diff --git a/tools/_yaml_args.py b/tools/_yaml_args.py index cead2860..74625454 100644 --- a/tools/_yaml_args.py +++ b/tools/_yaml_args.py @@ -14,7 +14,7 @@ """Shared YAML-config loader for standalone tool scripts. -The hy3 tool entry points (``tools/run_vllm_calibrate.py``, +The Hy3 tool entry points (``tools/run_vllm_calibrate.py``, ``tools/kvcache/run_kvcache_calibrate.py`` and ``tools/fp8_quant_with_vllm_activation.py``) all use ``argparse``. To match the style of ``scripts/ptq/run_vllm_quant_for_deepseek_v3.sh`` (shell only diff --git a/tools/kvcache/README.md b/tools/kvcache/README.md index 1313bb78..c673d0c5 100644 --- a/tools/kvcache/README.md +++ b/tools/kvcache/README.md @@ -60,8 +60,8 @@ python3 tools/kvcache/run_kvcache_calibrate.py \ --search-kv-num-steps 50 ``` -A ready-to-run wrapper for hy3 lives at -[`scripts/ptq/run_kvcache_calibrate_for_hy3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_hy3.sh). +A ready-to-run wrapper for Hy3 lives at +[`scripts/ptq/run_kvcache_calibrate_for_Hy3.sh`](../../scripts/ptq/run_kvcache_calibrate_for_Hy3.sh). > ⚠️ Requires the AngelSlim vLLM patch > ([`tools/vllm_patch/`](../vllm_patch/)) to be installed in the active From 6bcdf3db3849bbcbfc22177138abb0140c7efbc2 Mon Sep 17 00:00:00 2001 From: krizaltang Date: Fri, 29 May 2026 14:27:57 +0800 Subject: [PATCH 3/3] add kv per head yaml. 
--- configs/Hy3/ptq/Hy3_vllm_calibrate.yaml | 37 ---------- .../ptq/Hy3_vllm_quant_fp8_per_tensor.yaml | 27 ------- .../ptq/{ => fp8}/Hy3_kvcache_calibrate.yaml | 0 .../Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml | 70 +++++++++++++++++++ .../Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml | 70 +++++++++++++++++++ scripts/ptq/README.md | 10 ++- scripts/ptq/run_kvcache_calibrate_for_Hy3.sh | 2 +- scripts/ptq/run_vllm_calibrate_for_Hy3.sh | 2 +- scripts/ptq/run_vllm_quant_for_Hy3.sh | 25 +++---- tools/fp8_quant_with_vllm_activation.py | 37 ++++++++-- 10 files changed, 194 insertions(+), 86 deletions(-) delete mode 100644 configs/Hy3/ptq/Hy3_vllm_calibrate.yaml delete mode 100644 configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml rename configs/Hy3/ptq/{ => fp8}/Hy3_kvcache_calibrate.yaml (100%) create mode 100644 configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml create mode 100644 configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml diff --git a/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml b/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml deleted file mode 100644 index 3a29b001..00000000 --- a/configs/Hy3/ptq/Hy3_vllm_calibrate.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# vLLM activation calibration for Hy3 (collects activation + MoE expert stats, -# and optionally KV-cache stats / scale search). -# Consumed by: tools/run_vllm_calibrate.py -# -# Keys here match the script's argparse `dest` names. Values listed below -# override argparse defaults; explicit command-line flags still take final -# precedence (e.g. `python3 ... -c this.yaml --tp-size 8` will use 8). 
- -# -------- Paths -------- -model_path: /path/to/model -ptq_data_path: /path/to/dataset -output_dir: /path/to/statistics - -# -------- Model loading / runtime -------- -tp_size: 16 -batch_size: 4 -num_samples: 64 -max_length: 16384 -distributed_executor_backend: ray # ray | mp -skip_weight_loading: false # true => dummy weights (debug only) - -# -------- MTP (Multi-Token Prediction) -------- -enable_mtp: false -num_speculative_tokens: 1 - -# -------- Debug -------- -verbose: false - -# -------- KV-cache granularity -------- -kv_granularity: per-tensor # none | per-tensor | per-head - -# -------- KV-cache scale search -------- -search_kv_scale: false -search_kv_num_samples: 64 -search_kv_min_multiplier: 0.8 -search_kv_max_multiplier: 16.0 -search_kv_num_steps: 50 diff --git a/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml b/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml deleted file mode 100644 index 6f145a3e..00000000 --- a/configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Stage-2 of the Hy3 PTQ pipeline: FP8 quantization of bf16 weights using -# the activation_stats / moe_expert_stats produced by stage-1 calibration. -# Consumed by: tools/fp8_quant_with_vllm_activation.py -# -# Keys here match the script's argparse `dest` names. Values listed below -# override argparse defaults; explicit command-line flags still take final -# precedence (e.g. `python3 ... -c this.yaml --num-workers 8` will use 8). -# -# IMPORTANT: input_vllm_ac_json_path MUST equal the `output_dir` used in -# stage 1 (Hy3_vllm_calibrate.yaml), otherwise the FP8 quantizer will not -# find activation_stats.json / moe_expert_stats.json. 
- -input_bf16_hf_path: /path/to/input/model -input_vllm_ac_json_path: /path/to/statistics -output_fp8_hf_path: /path/to/output/fp8_model - -# Optional: leave at defaults unless you know what you are doing -block_size: [-1, -1] -num_workers: 16 - -# KV-cache scheme & granularity (must match calibration config) -# scheme: dynamic => no static scale is saved; granularity forced to per_token_per_head -# static => granularity can be none | per-tensor | per-head -k_scheme: static # dynamic | static -v_scheme: static # dynamic | static -k_granularity: per-tensor # none | per-tensor | per-head (only used when k_scheme=static) -v_granularity: per-tensor # none | per-tensor | per-head (only used when v_scheme=static) diff --git a/configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml b/configs/Hy3/ptq/fp8/Hy3_kvcache_calibrate.yaml similarity index 100% rename from configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml rename to configs/Hy3/ptq/fp8/Hy3_kvcache_calibrate.yaml diff --git a/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml new file mode 100644 index 00000000..3f4fd4e1 --- /dev/null +++ b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_kv_per_head.yaml @@ -0,0 +1,70 @@ +# Unified YAML config for the Hy3 PTQ pipeline. +# Consumed by BOTH stages of scripts/ptq/run_vllm_quant_for_Hy3.sh: +# - tools/run_vllm_calibrate.py (stage 1: activation calibration) +# - tools/fp8_quant_with_vllm_activation.py (stage 2: FP8 quantization) +# +# Each stage's argparse picks up only the keys it knows about; unrelated +# keys are ignored with a single "[yaml-config] WARNING: unknown keys" log. +# Keys here use underscore form (matching argparse `dest`). Explicit CLI +# flags still take final precedence over YAML values. 
+ +# ============================================================================ +# Shared paths (model_path / output_dir are defined for stage 1; stage 2 +# reuses them via fallbacks: input_bf16_hf_path defaults to model_path, +# input_vllm_ac_json_path defaults to output_dir). +# ============================================================================ +model_path: /path/to/model # stage1: bf16 model dir; stage2 reuses as input_bf16_hf_path +ptq_data_path: /path/to/dataset # stage1 only +output_dir: /path/to/statistics # stage1: where stats are written; stage2 reuses as input_vllm_ac_json_path +output_fp8_hf_path: /path/to/output/fp8_model # stage2 only + +# ============================================================================ +# Stage 1 — vLLM activation calibration (tools/run_vllm_calibrate.py) +# ============================================================================ + +# -------- Model loading / runtime -------- +tp_size: 16 +batch_size: 4 +num_samples: 512 +max_length: 16384 +distributed_executor_backend: ray # ray | mp +skip_weight_loading: false # true => dummy weights (debug only) + +# -------- MTP (Multi-Token Prediction) -------- +enable_mtp: false +num_speculative_tokens: 1 + +# -------- Debug -------- +verbose: false + +# -------- KV-cache calibration granularity -------- +# Used by stage 1 to decide how KV stats are *collected*. Calibration always +# runs regardless of the stage-2 scheme; this only controls the granularity +# of the collected scale (none | per-tensor | per-head). 
+kv_granularity: per-head + +# -------- KV-cache scale search (stage 1) -------- +search_kv_scale: true +search_kv_num_samples: 64 +search_kv_min_multiplier: 0.8 +search_kv_max_multiplier: 16.0 +search_kv_num_steps: 50 + +# ============================================================================ +# Stage 2 — FP8 quantization (tools/fp8_quant_with_vllm_activation.py) +# ============================================================================ + +# Optional: leave at defaults unless you know what you are doing +block_size: [-1, -1] +num_workers: 16 + +# KV-cache scheme & granularity at *quantization* time. +# scheme = dynamic => no static scale is saved; granularity is forced to +# per_token_per_head in config.json. +# scheme = static => calibrated scale is written to kv_cache_scales.safetensors, +# at the granularity selected below +# (none | per-tensor | per-head). +k_scheme: dynamic # dynamic | static +v_scheme: static # dynamic | static +quant_k_granularity: per-head # only used when k_scheme=static +quant_v_granularity: per-head # only used when v_scheme=static diff --git a/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml new file mode 100644 index 00000000..aa93074c --- /dev/null +++ b/configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml @@ -0,0 +1,70 @@ +# Unified YAML config for the Hy3 PTQ pipeline. +# Consumed by BOTH stages of scripts/ptq/run_vllm_quant_for_Hy3.sh: +# - tools/run_vllm_calibrate.py (stage 1: activation calibration) +# - tools/fp8_quant_with_vllm_activation.py (stage 2: FP8 quantization) +# +# Each stage's argparse picks up only the keys it knows about; unrelated +# keys are ignored with a single "[yaml-config] WARNING: unknown keys" log. +# Keys here use underscore form (matching argparse `dest`). Explicit CLI +# flags still take final precedence over YAML values. 
+ +# ============================================================================ +# Shared paths (model_path / output_dir are defined for stage 1; stage 2 +# reuses them via fallbacks: input_bf16_hf_path defaults to model_path, +# input_vllm_ac_json_path defaults to output_dir). +# ============================================================================ +model_path: /path/to/model # stage1: bf16 model dir; stage2 reuses as input_bf16_hf_path +ptq_data_path: /path/to/dataset # stage1 only +output_dir: /path/to/statistics # stage1: where stats are written; stage2 reuses as input_vllm_ac_json_path +output_fp8_hf_path: /path/to/output/fp8_model # stage2 only + +# ============================================================================ +# Stage 1 — vLLM activation calibration (tools/run_vllm_calibrate.py) +# ============================================================================ + +# -------- Model loading / runtime -------- +tp_size: 16 +batch_size: 4 +num_samples: 512 +max_length: 16384 +distributed_executor_backend: ray # ray | mp +skip_weight_loading: false # true => dummy weights (debug only) + +# -------- MTP (Multi-Token Prediction) -------- +enable_mtp: false +num_speculative_tokens: 1 + +# -------- Debug -------- +verbose: false + +# -------- KV-cache calibration granularity -------- +# Used by stage 1 to decide how KV stats are *collected*. Calibration always +# runs regardless of the stage-2 scheme; this only controls the granularity +# of the collected scale (none | per-tensor | per-head). 
+kv_granularity: per-tensor + +# -------- KV-cache scale search (stage 1) -------- +search_kv_scale: true +search_kv_num_samples: 64 +search_kv_min_multiplier: 0.8 +search_kv_max_multiplier: 16.0 +search_kv_num_steps: 50 + +# ============================================================================ +# Stage 2 — FP8 quantization (tools/fp8_quant_with_vllm_activation.py) +# ============================================================================ + +# Optional: leave at defaults unless you know what you are doing +block_size: [-1, -1] +num_workers: 16 + +# KV-cache scheme & granularity at *quantization* time. +# scheme = dynamic => no static scale is saved; granularity is forced to +# per_token_per_head in config.json. +# scheme = static => calibrated scale is written to kv_cache_scales.safetensors, +# at the granularity selected below +# (none | per-tensor | per-head). +k_scheme: static # dynamic | static +v_scheme: static # dynamic | static +quant_k_granularity: per-tensor # only used when k_scheme=static +quant_v_granularity: per-tensor # only used when v_scheme=static diff --git a/scripts/ptq/README.md b/scripts/ptq/README.md index 7931fe5a..9201460f 100644 --- a/scripts/ptq/README.md +++ b/scripts/ptq/README.md @@ -141,8 +141,14 @@ bash tools/vllm_patch/install.sh --help # 查看完整用法 #### 阶段 2:调用 `tools/fp8_quant_with_vllm_activation.py` - 读取 `${stats_dir}` 下的 `activation_stats.json` / `moe_expert_stats.json`,结合原 bf16 权重,做 per-tensor FP8 量化(含 weight + input scale),写出到 `${fp8_path}`。 -- KV-cache scale 的写入行为由量化 YAML 中的 `k_scheme` / `v_scheme` 控制: - - `static`:将校准得到的 scale 写入 `kv_cache_scales.safetensors`,粒度由 `k_granularity` / `v_granularity` 决定(`none` | `per-tensor` | `per-head`)。 +- 校准(stage-1)与量化(stage-2)共享 **同一份 YAML**:[`configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml`](../../configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml)。 + - 路径只配一次:stage-2 的 `input_bf16_hf_path` 默认回退到 stage-1 的 `model_path`,`input_vllm_ac_json_path` 默认回退到 stage-1 的 `output_dir`。 + - 
每个阶段只读取自己关心的字段,不认识的字段会打一行 `[yaml-config] WARNING: unknown keys` 然后忽略,属于正常现象。 +- KV-cache 的"校准粒度"与"量化粒度"分开控制: + - 校准阶段(stage-1)由 `kv_granularity`(`none` | `per-tensor` | `per-head`)决定 KV scale 的收集粒度。 + - 量化阶段(stage-2)由 `k_scheme` / `v_scheme`(`dynamic` | `static`)决定是否把 scale 写进 `kv_cache_scales.safetensors`;当 scheme=`static` 时,再由 `quant_k_granularity` / `quant_v_granularity`(`none` | `per-tensor` | `per-head`)决定写入粒度。 +- KV-cache scale 的写入行为由量化阶段的 `k_scheme` / `v_scheme` 控制: + - `static`:将校准得到的 scale 写入 `kv_cache_scales.safetensors`,粒度由 `quant_k_granularity` / `quant_v_granularity` 决定(`none` | `per-tensor` | `per-head`)。 - `dynamic`:不写入对应的 scale(`model.safetensors.index.json` 中也不包含对应 key),`config.json` 中标记为 `"scheme": "dynamic", "granularity": "per_token_per_head"`(与 `q_quant` 一致)。 - 产出的 `config.json` 中 `attn_quant_config.kv_cache_quant` 的 `k_quant` 和 `v_quant` 独立配置,支持 K/V 使用不同的 scheme。 diff --git a/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh index eb3413cb..d30a70b0 100755 --- a/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh +++ b/scripts/ptq/run_kvcache_calibrate_for_Hy3.sh @@ -17,7 +17,7 @@ export ASYNC_SCHEDULING=1 export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF -CONFIG=configs/Hy3/ptq/Hy3_kvcache_calibrate.yaml +CONFIG=configs/Hy3/ptq/fp8/Hy3_kvcache_calibrate.yaml mkdir -p logs diff --git a/scripts/ptq/run_vllm_calibrate_for_Hy3.sh b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh index c7c9c812..90dd3add 100755 --- a/scripts/ptq/run_vllm_calibrate_for_Hy3.sh +++ b/scripts/ptq/run_vllm_calibrate_for_Hy3.sh @@ -22,7 +22,7 @@ export ASYNC_SCHEDULING=1 export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF -CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml +CONFIG=configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml mkdir -p logs diff --git a/scripts/ptq/run_vllm_quant_for_Hy3.sh b/scripts/ptq/run_vllm_quant_for_Hy3.sh index e5f53df8..1b688290 100755 --- a/scripts/ptq/run_vllm_quant_for_Hy3.sh +++ 
b/scripts/ptq/run_vllm_quant_for_Hy3.sh @@ -6,16 +6,17 @@ # Stage 1: tools/run_vllm_calibrate.py # * Loads the bf16 model with vLLM, runs forward passes on the PTQ dataset, # and dumps activation_stats.json / moe_expert_stats.json / kv_cache_* -# into the directory given by ``output_dir`` in CALIB_CONFIG. +# into the directory given by ``output_dir`` in PTQ_CONFIG. # # Stage 2: tools/fp8_quant_with_vllm_activation.py # * Reads activation_stats.json (+ moe_expert_stats.json if any) plus the # original bf16 weights, applies per-tensor FP8 quantization with # calibrated input scales, and writes the FP8 HF model into the directory -# given by ``output_fp8_hf_path`` in QUANT_CONFIG. +# given by ``output_fp8_hf_path`` in PTQ_CONFIG. # -# IMPORTANT: ``input_vllm_ac_json_path`` in QUANT_CONFIG must equal -# ``output_dir`` in CALIB_CONFIG, otherwise stage 2 cannot find the stats. +# Both stages share a SINGLE unified YAML (PTQ_CONFIG); stage 2 reuses stage +# 1's ``model_path`` as ``input_bf16_hf_path`` and ``output_dir`` as +# ``input_vllm_ac_json_path``, so paths only need to be set once. # # Usage: # bash run_vllm_quant_for_Hy3.sh @@ -41,7 +42,7 @@ for arg in "$@"; do --skip-calibrate) do_calibrate=0 ;; --skip-quantize) do_quantize=0 ;; -h|--help) - sed -n '2,30p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//' + sed -n '2,32p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//' exit 0 ;; *) @@ -66,10 +67,10 @@ export VLLM_ENABLE_PREFIX_CACHING=1 export PRECISIONMODE=HF # ---------------------------------------------------------------------------- -# YAML configs (one per stage) +# Unified YAML config (drives BOTH stages; each stage's argparse picks up +# only the keys it knows about, and unknown keys are warned-and-ignored). 
# ---------------------------------------------------------------------------- -CALIB_CONFIG=configs/Hy3/ptq/Hy3_vllm_calibrate.yaml -QUANT_CONFIG=configs/Hy3/ptq/Hy3_vllm_quant_fp8_per_tensor.yaml +PTQ_CONFIG=configs/Hy3/ptq/fp8/Hy3_vllm_ptq_per_tensor.yaml mkdir -p logs @@ -78,10 +79,10 @@ mkdir -p logs # ============================================================================ if [[ "${do_calibrate}" -eq 1 ]]; then echo "[pipeline] === Stage 1/2: activation calibration ===" - echo "[pipeline] CALIB_CONFIG=${CALIB_CONFIG}" + echo "[pipeline] PTQ_CONFIG=${PTQ_CONFIG}" python3 tools/run_vllm_calibrate.py \ - -c "${CALIB_CONFIG}" \ + -c "${PTQ_CONFIG}" \ 2>&1 | tee "logs/run_vllm_quant_Hy3-calibrate.log" echo "[pipeline] Stage 1 finished." @@ -94,10 +95,10 @@ fi # ============================================================================ if [[ "${do_quantize}" -eq 1 ]]; then echo "[pipeline] === Stage 2/2: FP8 quantization ===" - echo "[pipeline] QUANT_CONFIG=${QUANT_CONFIG}" + echo "[pipeline] PTQ_CONFIG=${PTQ_CONFIG}" python3 tools/fp8_quant_with_vllm_activation.py \ - -c "${QUANT_CONFIG}" \ + -c "${PTQ_CONFIG}" \ 2>&1 | tee "logs/run_vllm_quant_Hy3-quantize.log" echo "[pipeline] Stage 2 finished." 
diff --git a/tools/fp8_quant_with_vllm_activation.py b/tools/fp8_quant_with_vllm_activation.py index 8966c7d9..a4dca258 100644 --- a/tools/fp8_quant_with_vllm_activation.py +++ b/tools/fp8_quant_with_vllm_activation.py @@ -1,3 +1,4 @@ +import argparse import json import math import multiprocessing as mp @@ -253,8 +254,8 @@ def main(bf16_path, fp8_path, block_size, ac_json_data): # Resolve scheme & granularity from CLI/YAML args k_scheme = getattr(args, "k_scheme", "static") v_scheme = getattr(args, "v_scheme", "static") - k_granularity_cfg = getattr(args, "k_granularity", "per-head").replace("-", "_") - v_granularity_cfg = getattr(args, "v_granularity", "per-head").replace("-", "_") + k_granularity_cfg = getattr(args, "quant_k_granularity", "per-head").replace("-", "_") + v_granularity_cfg = getattr(args, "quant_v_granularity", "per-head").replace("-", "_") # If scheme is dynamic, granularity is forced to per_token_per_head if k_scheme == "dynamic": @@ -557,19 +558,27 @@ def process_moe_values(data: Dict[str, Dict]) -> Dict[str, Dict]: "granularity forced to per_token_per_head) or 'static' (use calibrated scale).", ) parser.add_argument( - "--k-granularity", + "--quant-k-granularity", type=str, default="per-head", choices=["none", "per-tensor", "per-head"], - help="K-cache granularity when k_scheme=static (ignored if k_scheme=dynamic).", + help="K-cache granularity used at *quantization* time when k_scheme=static " + "(ignored if k_scheme=dynamic). Distinct from the calibration-time " + "granularity controlled by stage-1's --kv-granularity.", ) parser.add_argument( - "--v-granularity", + "--quant-v-granularity", type=str, default="per-head", choices=["none", "per-tensor", "per-head"], - help="V-cache granularity when v_scheme=static (ignored if v_scheme=dynamic).", + help="V-cache granularity used at *quantization* time when v_scheme=static " + "(ignored if v_scheme=dynamic). 
Distinct from the calibration-time " + "granularity controlled by stage-1's --kv-granularity.", ) + # Stage-1 path keys (model_path / output_dir) are accepted as fallbacks + # so that one unified YAML can drive both stages. + parser.add_argument("--model-path", type=str, default="", help=argparse.SUPPRESS) + parser.add_argument("--output-dir", type=str, default="", help=argparse.SUPPRESS) args = parser.parse_args() # Lazy-import _yaml_args (sibling module in tools/). Done here instead of @@ -584,6 +593,22 @@ def process_moe_values(data: Dict[str, Dict]) -> Dict[str, Dict]: apply_yaml_config(parser, args) + # Path fallbacks: when running with the unified Hy3 YAML, stage 2 reuses + # stage 1's `model_path` as the bf16 input dir, and `output_dir` (where + # stage 1 wrote stats) as the activation-json dir. + if not getattr(args, "input_bf16_hf_path", "") and getattr(args, "model_path", ""): + args.input_bf16_hf_path = args.model_path + print( + f"[yaml-config] input_bf16_hf_path not set; falling back to " + f"model_path={args.input_bf16_hf_path!r}" + ) + if not getattr(args, "input_vllm_ac_json_path", "") and getattr(args, "output_dir", ""): + args.input_vllm_ac_json_path = args.output_dir + print( + f"[yaml-config] input_vllm_ac_json_path not set; falling back to " + f"output_dir={args.input_vllm_ac_json_path!r}" + ) + # Validate required paths (may come from CLI or YAML). missing = [ name