From f660a938f18f4d11bef33eaf380f50b74f16af50 Mon Sep 17 00:00:00 2001
From: Willow <deepindeed2022@gmail.com>
Date: Fri, 10 Apr 2026 08:29:49 +0000
Subject: [PATCH 1/3] support Qwen3.5 quantization

---
 examples/llm_ptq/README.md                    |  1 +
 examples/llm_ptq/example_utils.py             |  6 ++
 examples/vlm_ptq/README.md                    |  1 +
 modelopt/torch/export/model_utils.py          |  2 +
 modelopt/torch/export/quant_utils.py          |  2 +-
 .../_test_utils/torch/transformers_models.py  | 36 +++++++++++
 .../quantization/plugins/test_huggingface.py  | 59 +++++++++++++++++++
 7 files changed, 106 insertions(+), 1 deletion(-)
diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md
index 1cc1acfbf9..83a89ad37d 100755
--- a/examples/llm_ptq/README.md
+++ b/examples/llm_ptq/README.md
@@ -109,6 +109,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
 | Gemma 3 | ✅<sup>2</sup> | - | ✅ | - | - |
 | QWen 2, 2.5 <sup>4</sup> | ✅ | ✅ | ✅ | ✅ | ✅ |
 | QWen3, 3.5 MOE, Next <sup>6</sup> | ✅ | - | - | - | ✅ |
+| QWen3.5 <sup>6</sup> | ✅ | - | ✅ | - | - |
 | QwQ | ✅ | - | - | - | ✅ |
 | DeepSeek V3, R1, V3.1, V3.2<sup>7</sup> | - | - | - | - | ✅ |
 | GLM-4.7<sup>8</sup> | ✅ | - | - | - | ✅ |
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index c2d4d4bfca..712741ace8 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -252,6 +252,12 @@ def build_quant_cfg(
         quant_cfg["quant_cfg"].append({"quantizer_name": "*image*", "enable": False})
         quant_cfg["quant_cfg"].append({"quantizer_name": "*vision*", "enable": False})
 
+    if model_type == "qwen3_5":
+        # GatedDeltaNet's in_proj_b and in_proj_a have very narrow output dimensions
+        # (hidden_size -> num_v_heads, e.g. 1024 -> 16), quantizing them causes accuracy loss.
+        quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
+        quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
+
     return quant_cfg
 
 
diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md
index 8b9c31aa42..2a4a16f1e2 100644
--- a/examples/vlm_ptq/README.md
+++ b/examples/vlm_ptq/README.md
@@ -38,6 +38,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#getting-started) fo
 | VILA | ✅ | ✅ | ✅ | ✅ | - |
 | Phi-3-vision, Phi-4-multimodal | ✅ | ✅ | ✅ | ✅ | ✅  |
 | Qwen2, 2.5-VL | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Qwen3.5 | ✅ | - | ✅ | - | - |
 | Gemma3 | ✅ | - | - | - | - |
 
 > *<sup>1.</sup>Only TensorRT-LLM checkpoint export is supported. Not compatible with the TensorRT-LLM torch backend* \
diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index 3bd72d9de9..ceea524c88 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -29,6 +29,8 @@
     "MPT": "mpt",
     "Bloom": "bloom",
     "ChatGLM": "chatglm",
+    "Qwen3_5Moe": "qwen3_5moe",
+    "Qwen3_5": "qwen3_5",
     "Qwen3Moe": "qwen3moe",
     "Qwen3Next": "qwen3next",
     "QWen": "qwen",
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 4ceb51cd2c..8b3c7612fc 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -1221,7 +1221,7 @@ def _update_svdquant(modules, new_pre_quant_scale):
     # Mathematical equivalence:
     #   Before: down_proj_out = {[act_fn(self.gate_proj(x)) * up_proj(x)] * scale} @ down_proj.W^T
     #   After:  down_proj_out = {[act_fn(self.gate_proj(x)) * (up_proj(x) * scale)]} @ down_proj.W^T
-    (["LlamaMLP", "Qwen3MLP", "Qwen3MoeMLP"], ("up_proj", "down_proj")),
+    (["LlamaMLP", "Qwen3MLP", "Qwen3MoeMLP", "Qwen3_5MLP"], ("up_proj", "down_proj")),
 ]
 
 
diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
index 8fe2f68b32..14f140657b 100644
--- a/tests/_test_utils/torch/transformers_models.py
+++ b/tests/_test_utils/torch/transformers_models.py
@@ -35,6 +35,11 @@
     T5ForConditionalGeneration,
 )
 
+try:
+    from transformers import Qwen3_5TextConfig
+except ImportError:
+    Qwen3_5TextConfig = None
+
 import modelopt.torch.opt as mto
 
 SEED = 1234
@@ -117,6 +122,37 @@ def create_tiny_qwen3_moe_dir(
     get_tiny_qwen3_moe(**config_kwargs).save_pretrained(qwen3_moe_dir)
     return qwen3_moe_dir
 
+##### Qwen3.5 (hybrid linear attention + full attention) #####
+def get_tiny_qwen3_5(**config_kwargs) -> PreTrainedModel:
+    if Qwen3_5TextConfig is None:
+        pytest.skip("Qwen3_5TextConfig not available (requires transformers >= 4.57)")
+
+    set_seed(SEED)
+
+    kwargs = {
+        "dtype": torch.bfloat16,
+        "hidden_size": 32,
+        "intermediate_size": 32,
+        "num_hidden_layers": 4,
+        "num_attention_heads": 2,
+        "num_key_value_heads": 1,
+        "head_dim": 16,
+        "linear_num_key_heads": 4,
+        "linear_num_value_heads": 4,
+        "linear_key_head_dim": 8,
+        "linear_value_head_dim": 8,
+        "linear_conv_kernel_dim": 4,
+        "full_attention_interval": 4,
+        "attn_output_gate": True,
+        "max_position_embeddings": 32,
+        "vocab_size": 32,
+        "rms_norm_eps": 1e-6,
+    }
+    kwargs.update(**config_kwargs)
+    tiny_qwen3_5 = AutoModelForCausalLM.from_config(Qwen3_5TextConfig(**kwargs))
+
+    return tiny_qwen3_5
+
 
 ##### GPT-OSS #####
 def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py
index 692ab07d4a..6f89181ec1 100644
--- a/tests/unit/torch/quantization/plugins/test_huggingface.py
+++ b/tests/unit/torch/quantization/plugins/test_huggingface.py
@@ -24,6 +24,7 @@
     create_tiny_llama_dir,
     get_tiny_gpt_oss,
     get_tiny_llama,
+    get_tiny_qwen3_5,
     get_tiny_qwen3_moe,
     tf_modelopt_state_and_output_tester,
 )
@@ -243,3 +244,61 @@ def test_hf_decoder_discoverer_registration_path():
     assert LayerActivationCollector.get_decoder_layers(model) is get_homogeneous_hf_decoder_layers(
         model
     )
+
+
+@pytest.mark.parametrize(
+    "quant_config",
+    [mtq.FP8_DEFAULT_CFG, mtq.INT4_AWQ_CFG],
+    ids=["fp8", "int4_awq"],
+)
+def test_qwen3_5_hybrid_attention_quantize(quant_config):
+    """Verify FP8 and AWQ quantization works for Qwen3.5 hybrid (GatedDeltaNet + Attention)."""
+    import copy
+
+    model = get_tiny_qwen3_5()
+
+    quant_cfg = copy.deepcopy(quant_config)
+    if quant_config is mtq.INT4_AWQ_CFG:
+        for entry in quant_cfg["quant_cfg"]:
+            if entry["quantizer_name"] == "*weight_quantizer":
+                entry.setdefault("cfg", {})["block_sizes"] = {-1: 16}
+                break
+
+    # Disable narrow GatedDeltaNet projections (same as example_utils does for qwen3_5)
+    quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
+    quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
+
+    def calib_fn(model):
+        x = model.dummy_inputs["input_ids"]
+        for _ in range(2):
+            model(x)
+
+    mtq.quantize(model, quant_cfg, calib_fn)
+
+    # Verify the model still produces output
+    with torch.no_grad():
+        out = model(model.dummy_inputs["input_ids"])
+    assert out.logits is not None
+
+    # Verify both GatedDeltaNet and Attention linear layers got quantized
+    has_gdn_quantized = False
+    has_attn_quantized = False
+    for name, module in model.named_modules():
+        if hasattr(module, "weight_quantizer") and hasattr(module, "weight"):
+            if "linear_attn.in_proj_qkv" in name:
+                has_gdn_quantized = True
+            if "self_attn.q_proj" in name:
+                has_attn_quantized = True
+    assert has_gdn_quantized, "GatedDeltaNet linear layers should be quantized"
+    assert has_attn_quantized, "Attention linear layers should be quantized"
+
+    # Verify narrow projections are NOT quantized
+    for name, module in model.named_modules():
+        if "in_proj_b" in name and hasattr(module, "weight_quantizer"):
+            assert not module.weight_quantizer.is_enabled, (
+                f"in_proj_b should have quantization disabled: {name}"
+            )
+        if "in_proj_a" in name and hasattr(module, "weight_quantizer"):
+            assert not module.weight_quantizer.is_enabled, (
+                f"in_proj_a should have quantization disabled: {name}"
+            )

From d2bb7ad0eda6ea63a3019cb24a1f7c39fdf3feab Mon Sep 17 00:00:00 2001
From: Willow <deepindeed2022@gmail.com>
Date: Mon, 13 Apr 2026 01:52:25 +0000
Subject: [PATCH 2/3] fix qwen3.5 ut

---
 tests/_test_utils/torch/transformers_models.py            | 1 +
 tests/unit/torch/quantization/plugins/test_huggingface.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
index 14f140657b..2b652d05dc 100644
--- a/tests/_test_utils/torch/transformers_models.py
+++ b/tests/_test_utils/torch/transformers_models.py
@@ -124,6 +124,7 @@ def create_tiny_qwen3_moe_dir(
 
 ##### Qwen3.5 (hybrid linear attention + full attention) #####
 def get_tiny_qwen3_5(**config_kwargs) -> PreTrainedModel:
+    """Create a tiny Qwen3.5 model with hybrid GatedDeltaNet + full attention layers for testing."""
     if Qwen3_5TextConfig is None:
         pytest.skip("Qwen3_5TextConfig not available (requires transformers >= 4.57)")
 
diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py
index 6f89181ec1..736ca223e2 100644
--- a/tests/unit/torch/quantization/plugins/test_huggingface.py
+++ b/tests/unit/torch/quantization/plugins/test_huggingface.py
@@ -269,6 +269,7 @@ def test_qwen3_5_hybrid_attention_quantize(quant_config):
     quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
 
     def calib_fn(model):
+        """Run calibration forward passes with dummy inputs."""
         x = model.dummy_inputs["input_ids"]
         for _ in range(2):
             model(x)
@@ -285,9 +286,9 @@ def calib_fn(model):
     has_attn_quantized = False
     for name, module in model.named_modules():
         if hasattr(module, "weight_quantizer") and hasattr(module, "weight"):
-            if "linear_attn.in_proj_qkv" in name:
+            if "linear_attn.in_proj_qkv" in name and module.weight_quantizer.is_enabled:
                 has_gdn_quantized = True
-            if "self_attn.q_proj" in name:
+            if "self_attn.q_proj" in name and module.weight_quantizer.is_enabled:
                 has_attn_quantized = True
     assert has_gdn_quantized, "GatedDeltaNet linear layers should be quantized"
     assert has_attn_quantized, "Attention linear layers should be quantized"

From 477ee772d6c07e7b7de667b01aa71bca42c2d130 Mon Sep 17 00:00:00 2001
From: Willow <deepindeed2022@gmail.com>
Date: Mon, 13 Apr 2026 03:09:44 +0000
Subject: [PATCH 3/3] add docstring for Docstring coverage

---
 examples/llm_ptq/example_utils.py                    |  6 ++++++
 tests/_test_utils/torch/transformers_models.py       |  2 ++
 .../torch/quantization/plugins/test_huggingface.py   | 12 ++++++++++++
 3 files changed, 20 insertions(+)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 712741ace8..ac07295220 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -209,6 +209,7 @@ def build_quant_cfg(
     model_type,
     moe_calib_experts_ratio: float | None = None,
 ) -> dict[str, Any]:
+    """Build quantization config with model-specific overrides for AWQ, SmoothQuant, and VLM."""
     quant_cfg = copy.deepcopy(quant_cfg)
     if "awq" in str(quant_cfg.get("algorithm")):
         from modelopt.torch.quantization.config import find_quant_cfg_entry_by_path
@@ -257,6 +258,11 @@ def build_quant_cfg(
         # (hidden_size -> num_v_heads, e.g. 1024 -> 16), quantizing them causes accuracy loss.
         quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
         quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
+        # TRT-LLM's Qwen3.5 linear-attention packing only supports weight/weight_scale_inv;
+        # disable activation quantization so input_scale is not exported for these layers.
+        quant_cfg["quant_cfg"].append(
+            {"quantizer_name": "*linear_attn*input_quantizer", "enable": False}
+        )
 
     return quant_cfg
 
diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
index 2b652d05dc..4a338ca67e 100644
--- a/tests/_test_utils/torch/transformers_models.py
+++ b/tests/_test_utils/torch/transformers_models.py
@@ -112,6 +112,7 @@ def get_tiny_qwen3_moe(**config_kwargs) -> PreTrainedModel:
 def create_tiny_qwen3_moe_dir(
     tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs
 ) -> Path:
+    """Save a tiny Qwen3 MoE model (and optional tokenizer) to a temp directory."""
     qwen3_moe_dir = Path(tmp_path) / "tiny_qwen3_moe"
     if with_tokenizer:
         tokenizer = AutoTokenizer.from_pretrained(
@@ -157,6 +158,7 @@ def get_tiny_qwen3_5(**config_kwargs) -> PreTrainedModel:
 
 ##### GPT-OSS #####
 def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
+    """Create a tiny GPT-OSS MoE model for testing."""
     set_seed(SEED)
 
     kwargs = {
diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py
index 736ca223e2..326781e723 100644
--- a/tests/unit/torch/quantization/plugins/test_huggingface.py
+++ b/tests/unit/torch/quantization/plugins/test_huggingface.py
@@ -236,6 +236,7 @@ def test_is_homogeneous_hf_model_gpt_oss():
 
 
 def test_hf_decoder_discoverer_registration_path():
+    """Verify HF decoder layer discoverer is registered and returns correct layers."""
     model = get_tiny_llama()
     assert any(
         is_supported is is_homogeneous_hf_model and discoverer is get_homogeneous_hf_decoder_layers
@@ -267,6 +268,10 @@ def test_qwen3_5_hybrid_attention_quantize(quant_config):
     # Disable narrow GatedDeltaNet projections (same as example_utils does for qwen3_5)
     quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_b*", "enable": False})
     quant_cfg["quant_cfg"].append({"quantizer_name": "*in_proj_a*", "enable": False})
+    # Disable activation quantization for linear attention (TRT-LLM packing limitation)
+    quant_cfg["quant_cfg"].append(
+        {"quantizer_name": "*linear_attn*input_quantizer", "enable": False}
+    )
 
     def calib_fn(model):
         """Run calibration forward passes with dummy inputs."""
@@ -303,3 +308,10 @@ def calib_fn(model):
             assert not module.weight_quantizer.is_enabled, (
                 f"in_proj_a should have quantization disabled: {name}"
             )
+
+    # Verify linear_attn input quantizers are disabled (no input_scale for TRT-LLM export)
+    for name, module in model.named_modules():
+        if "linear_attn" in name and hasattr(module, "input_quantizer"):
+            assert not module.input_quantizer.is_enabled, (
+                f"linear_attn input_quantizer should be disabled: {name}"
+            )