
Commit db95bb7

[None][feat] Optimize nemotron-h from python level

* Enable more c++ routing combinations.
* Update mamba tensor operations.

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>

1 parent e79eac2

2 files changed: 19 additions & 16 deletions


tensorrt_llm/_torch/modules/fused_moe/routing.py

Lines changed: 3 additions & 3 deletions
@@ -264,9 +264,9 @@ def noaux_tc(self, logits, e_score_correction_bias):
                 "The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
             )
             self.is_fused = False
-        elif (num_experts > 512 or (self.top_k > 8 and self.top_k != 22)
-              or (self.topk_group == 1 and self.top_k != 22)):
-            # We have special implementation for n_group == 1, top_k == 22 and num_experts == 512 for Nemotron Super v3.
+        elif num_experts > 512 or (self.top_k > 8 and self.top_k != 22):
+            # The fused noaux_tc_op kernel supports n_group==1 with top_k<=8
+            # or top_k==22, and num_experts<=512.
             if self.is_fused:
                 warnings.warn(
                     "The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."

tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py

Lines changed: 16 additions & 13 deletions
@@ -240,6 +240,16 @@ def post_load_weights(self):
                 and self.norm.nvfp4_scale is None):
             self._try_attach_nvfp4_scale()

+        # Pre-expand A, D, dt_bias for the decode path.
+        self._A_expanded = repeat(self.A,
+                                  "h -> h p n",
+                                  p=self.head_dim,
+                                  n=self.d_state).to(dtype=torch.float32)
+        self._dt_bias_expanded = repeat(self.dt_bias,
+                                        "h -> h p",
+                                        p=self.head_dim)
+        self._D_expanded = repeat(self.D, "h -> h p", p=self.head_dim)
+
     def _try_attach_nvfp4_scale(self):
         """Attach input_scale from out_proj to norm for fused RMSNorm+Quant."""

@@ -454,22 +464,15 @@ def convert_dt():
                 ],
                 dim=-1,
             )
-            # Use .contiguous() to ensure proper 128-byte alignment required by
-            # flashinfer's selective_state_update kernel. x_d, B_d, C_d are views
-            # into sliced tensors which may not be 128-byte aligned.
-            x_d = rearrange(x_d, "b (h p) -> b h p",
-                            p=self.head_dim).contiguous()
+            x_d = rearrange(x_d, "b (h p) -> b h p", p=self.head_dim)
             dt_d = repeat(dt_d, "b h -> b h p", p=self.head_dim)
-            B_d = rearrange(B_d, "b (g n) -> b g n",
-                            g=self.tp_ngroups).contiguous()
-            C_d = rearrange(C_d, "b (g n) -> b g n",
-                            g=self.tp_ngroups).contiguous()
+            B_d = rearrange(B_d, "b (g n) -> b g n", g=self.tp_ngroups)
+            C_d = rearrange(C_d, "b (g n) -> b g n", g=self.tp_ngroups)
             z_d = rearrange(z_d, "b (h p) -> b h p", p=self.head_dim)

-            A = repeat(self.A, "h -> h p n", p=self.head_dim,
-                       n=self.d_state).to(dtype=torch.float32)
-            dt_bias = repeat(self.dt_bias, "h -> h p", p=self.head_dim)
-            D = repeat(self.D, "h -> h p", p=self.head_dim)
+            A = self._A_expanded
+            dt_bias = self._dt_bias_expanded
+            D = self._D_expanded
             if is_target_verify:
                 intermediate_ssm_states = layer_cache.intermediate_ssm
                 # Build kwargs for MTP selective_state_update
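
The mamba2_mixer change hoists the einops repeat() expansions of A, dt_bias, and D out of the per-step decode path and into post_load_weights, so the expanded tensors are built once per weight load and merely referenced on every selective_state_update call; it also drops the .contiguous() copies that the deleted comment tied to flashinfer's 128-byte alignment requirement. A standalone sketch of the caching pattern, assuming einops and torch are installed (the shapes and values below are illustrative, not taken from the module):

import torch
from einops import repeat

nheads, head_dim, d_state = 8, 64, 128

# Per-head SSM parameters as they exist after weight loading (illustrative).
A = torch.randn(nheads)
dt_bias = torch.randn(nheads)
D = torch.randn(nheads)

# One-time pre-expansion, analogous to what the commit adds to post_load_weights.
A_expanded = repeat(A, "h -> h p n", p=head_dim, n=d_state).to(dtype=torch.float32)
dt_bias_expanded = repeat(dt_bias, "h -> h p", p=head_dim)
D_expanded = repeat(D, "h -> h p", p=head_dim)

# The decode path can then reuse the cached tensors on every generated token
# instead of re-running repeat() each step.
assert A_expanded.shape == (nheads, head_dim, d_state)
assert dt_bias_expanded.shape == (nheads, head_dim)
assert D_expanded.shape == (nheads, head_dim)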
