Merged
3 changes: 2 additions & 1 deletion examples/vllm_serve/README.md
@@ -28,6 +28,7 @@ You can either edit the `quant_config` dictionary in `vllm_serve_fakequant.py`,
| QUANT_FILE_PATH | Optional path to exported quantizer state dict `quantizer_state.pth` | None |
| MODELOPT_STATE_PATH | Optional path to exported `vllm_fq_modelopt_state.pth` (restores quantizer state and parameters) | None |
| CALIB_BATCH_SIZE | Calibration batch size | 1 |
| RECIPE_PATH | Optional path to a ModelOpt PTQ recipe YAML | None |

Set these variables in your shell or Docker environment as needed to customize calibration.
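As a hypothetical illustration (the paths and values below are placeholders, not defaults shipped with the example), the variables in the table above could be set like this before launching the server:

```shell
# Placeholder values -- adjust to your model and recipe before launching.
export QUANT_CFG="NVFP4_DEFAULT_CFG"                     # name of an mtq config attribute
export RECIPE_PATH="/workspace/recipes/ptq_nvfp4.yaml"   # optional ModelOpt PTQ recipe
export CALIB_BATCH_SIZE=4                                # calibration batch size (default: 1)

# Then launch as usual, e.g.:
# python vllm_serve_fakequant.py <args...>
```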

@@ -65,7 +66,7 @@ Step 1: export the model with bf16 weights and quantizer state. To export the mo
```bash
python ../llm_ptq/hf_ptq.py \
--pyt_ckpt_path <MODEL_PATH> \
--qformat nvfp4 \
--recipe <PATH_TO_RECIPE> \
--calib_size 512 \
--export_path <EXPORT_DIR> \
--vllm_fakequant_export \
2 changes: 2 additions & 0 deletions examples/vllm_serve/fakequant_worker.py
@@ -43,6 +43,7 @@
"quant_file_path": os.environ.get("QUANT_FILE_PATH", None),
"modelopt_state_path": os.environ.get("MODELOPT_STATE_PATH", None),
"calib_batch_size": int(os.environ.get("CALIB_BATCH_SIZE", 1)),
"recipe_path": os.environ.get("RECIPE_PATH", None),
}
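For reference, the env-var-with-default pattern used above can be exercised standalone. A minimal sketch, assuming nothing beyond the standard library; `load_calib_config` is a hypothetical helper for illustration, not part of the worker:

```python
import os

def load_calib_config(env=None):
    """Read calibration settings, falling back to defaults when a variable is unset."""
    env = os.environ if env is None else env
    return {
        "recipe_path": env.get("RECIPE_PATH", None),               # str path or None
        "modelopt_state_path": env.get("MODELOPT_STATE_PATH", None),
        "calib_batch_size": int(env.get("CALIB_BATCH_SIZE", 1)),   # cast, as in the diff
    }

cfg = load_calib_config({"CALIB_BATCH_SIZE": "4"})
```

Note the explicit `int(...)` cast: environment variables are always strings, so numeric settings must be converted before use.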


@@ -138,6 +139,7 @@ def compile_or_warm_up_model(self) -> None:
quant_config["quant_cfg"]
or quant_config["kv_quant_cfg"]
or quant_config["modelopt_state_path"]
or quant_config["recipe_path"]
):
_fakequant_run_prolog_worker(self)
super().compile_or_warm_up_model()
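The gating condition above can be checked in isolation. A sketch for illustration only; `should_run_prolog` is a hypothetical helper mirroring the or-chain, not a function in the worker:

```python
def should_run_prolog(quant_config):
    """Any configured source of quantizer state triggers the fakequant prolog."""
    return bool(
        quant_config.get("quant_cfg")
        or quant_config.get("kv_quant_cfg")
        or quant_config.get("modelopt_state_path")
        or quant_config.get("recipe_path")
    )
```

With this change, setting only `RECIPE_PATH` is enough to enter calibration, without also setting `QUANT_CFG`.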
40 changes: 25 additions & 15 deletions examples/vllm_serve/vllm_ptq_utils.py
@@ -24,6 +24,7 @@
from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput

import modelopt.torch.quantization as mtq
from modelopt.recipe import ModelOptPTQRecipe, load_recipe


def _create_new_data_cls(data_cls, **kwargs):
@@ -141,22 +142,31 @@ def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: list) -> list:
def get_quant_config(quant_config: dict[str, Any], model: Any) -> dict[str, Any]:
import copy

quant_cfg = (
copy.deepcopy(getattr(mtq, quant_config["quant_cfg"])) if quant_config["quant_cfg"] else {}
)
quant_kv_cfg = (
copy.deepcopy(getattr(mtq, quant_config["kv_quant_cfg"]))
if quant_config["kv_quant_cfg"]
else {}
)
if quant_config["recipe_path"]:
recipe = load_recipe(quant_config["recipe_path"])
assert isinstance(recipe, ModelOptPTQRecipe), (
f"Expected PTQ recipe, but got {type(recipe).__name__} from {quant_config['recipe_path']}"
)
quant_cfg = recipe.quantize
Comment on lines +145 to +150
⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

```bash
#!/bin/bash
# Verify current assertion usage in this path.
rg -n -C2 'assert isinstance\(recipe, ModelOptPTQRecipe\)' --type=py

# Demonstrate Python optimization removing assert checks.
python - <<'PY'
src = """def f(x):
    assert isinstance(x, int), "bad type"
    return x
"""
ns = {}
exec(compile(src, "<inline>", "exec", optimize=1), ns)
import dis
dis.dis(ns["f"])
PY
```

Repository: NVIDIA/Model-Optimizer


Use explicit exception instead of assert for runtime validation.

At line 147, using assert isinstance() is unsafe because assertions can be disabled when Python runs with optimization flags (e.g., python -O), allowing invalid recipe types to bypass this check silently. Use an explicit if/raise ValueError() pattern instead.

Proposed fix

```diff
     if quant_config["recipe_path"]:
         recipe = load_recipe(quant_config["recipe_path"])
-        assert isinstance(recipe, ModelOptPTQRecipe), (
-            f"Expected PTQ recipe, but got {type(recipe).__name__} from {quant_config['recipe_path']}"
-        )
+        if not isinstance(recipe, ModelOptPTQRecipe):
+            raise ValueError(
+                f"Expected PTQ recipe, but got {type(recipe).__name__} from {quant_config['recipe_path']}"
+            )
         quant_cfg = recipe.quantize
```
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

```diff
     if quant_config["recipe_path"]:
         recipe = load_recipe(quant_config["recipe_path"])
-        assert isinstance(recipe, ModelOptPTQRecipe), (
-            f"Expected PTQ recipe, but got {type(recipe).__name__} from {quant_config['recipe_path']}"
-        )
+        if not isinstance(recipe, ModelOptPTQRecipe):
+            raise ValueError(
+                f"Expected PTQ recipe, but got {type(recipe).__name__} from {quant_config['recipe_path']}"
+            )
         quant_cfg = recipe.quantize
```
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/vllm_serve/vllm_ptq_utils.py` around lines 145-150, replace the
unsafe assert in the recipe validation with an explicit runtime check: after
calling load_recipe(quant_config["recipe_path"]) and assigning to recipe, verify
type with if not isinstance(recipe, ModelOptPTQRecipe): raise a ValueError
containing the same descriptive message (including the actual type and recipe
path) and then set quant_cfg = recipe.quantize; this ensures load_recipe,
ModelOptPTQRecipe, quant_cfg and recipe behavior remains the same but prevents
the check from being skipped under Python optimizations.

else:
quant_cfg = (
copy.deepcopy(getattr(mtq, quant_config["quant_cfg"]))
if quant_config["quant_cfg"]
else {}
)
quant_kv_cfg = (
copy.deepcopy(getattr(mtq, quant_config["kv_quant_cfg"]))
if quant_config["kv_quant_cfg"]
else {}
)

# Check if model has MLA and update KV config accordingly
if quant_kv_cfg:
quant_kv_cfg["quant_cfg"] = update_kv_cfg_for_mla(model, quant_kv_cfg["quant_cfg"])

if quant_kv_cfg:
quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
quant_cfg, quant_kv_cfg["quant_cfg"]
)

return quant_cfg
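The review's recommendation can be demonstrated in isolation. A minimal, self-contained sketch: `FakeRecipe`/`FakePTQRecipe` are stand-ins for the real recipe classes, and `resolve_quant_cfg` is a hypothetical helper, not ModelOpt API:

```python
class FakeRecipe:
    """Stand-in base class for a loaded recipe."""

class FakePTQRecipe(FakeRecipe):
    """Stand-in PTQ recipe exposing a `quantize` config, as in the diff."""
    quantize = {"quant_cfg": {"*weight_quantizer": {"num_bits": 4}}}

def resolve_quant_cfg(recipe, recipe_path):
    # Explicit check survives `python -O`, unlike `assert isinstance(...)`.
    if not isinstance(recipe, FakePTQRecipe):
        raise ValueError(
            f"Expected PTQ recipe, but got {type(recipe).__name__} from {recipe_path}"
        )
    return recipe.quantize
```

Because `assert` statements are compiled out under `-O`, the `if`/`raise` form is the only one that reliably rejects a non-PTQ recipe in production.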
1 change: 1 addition & 0 deletions examples/vllm_serve/vllm_serve_fakequant.py
@@ -78,6 +78,7 @@
"KV_QUANT_CFG",
"MODELOPT_STATE_PATH",
"CALIB_BATCH_SIZE",
"RECIPE_PATH",
"TRUST_REMOTE_CODE",
}
