From c19d43e71ababe76d2d563e70f5e96b232d8b347 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 13 Apr 2026 23:35:47 -0700 Subject: [PATCH] Add fused GatedDeltaNet decode Triton kernel Fuse Q/K/V split, L2 normalization, head repeat, gating computation, and delta-rule recurrent state update into a single Triton kernel for decode (T=1). Replaces ~6 small AOTI-generated kernels with one, reducing GatedDeltaNet kernel time by ~62% and improving end-to-end decode throughput by ~2% (106 -> 108.5 tok/s on A100). --- backends/cuda/triton/kernels/__init__.py | 9 + .../triton/kernels/fused_deltanet_decode.py | 310 ++++++++++++++++++ .../kernels/test_fused_deltanet_decode.py | 295 +++++++++++++++++ examples/models/qwen3_5_moe/export.py | 1 - examples/models/qwen3_5_moe/model.py | 74 ++--- 5 files changed, 643 insertions(+), 46 deletions(-) create mode 100644 backends/cuda/triton/kernels/fused_deltanet_decode.py create mode 100644 backends/cuda/triton/kernels/test_fused_deltanet_decode.py diff --git a/backends/cuda/triton/kernels/__init__.py b/backends/cuda/triton/kernels/__init__.py index e7af2bdaf84..ee8a80b8df6 100644 --- a/backends/cuda/triton/kernels/__init__.py +++ b/backends/cuda/triton/kernels/__init__.py @@ -29,3 +29,12 @@ __all__.append("tq4_sdpa") except ImportError: pass + +try: + from executorch.backends.cuda.triton.kernels.fused_deltanet_decode import ( # noqa: F401 + fused_deltanet_decode, + ) + + __all__.append("fused_deltanet_decode") +except ImportError: + pass diff --git a/backends/cuda/triton/kernels/fused_deltanet_decode.py b/backends/cuda/triton/kernels/fused_deltanet_decode.py new file mode 100644 index 00000000000..ce55c62dae8 --- /dev/null +++ b/backends/cuda/triton/kernels/fused_deltanet_decode.py @@ -0,0 +1,310 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +""" +Fully-fused SRAM-resident GatedDeltaNet recurrent kernel for decode (T=1). + +Fuses post-projection (Q/K/V split from conv1d output, L2 normalization, +head repeat, gating computation) AND the recurrent state update into a +single Triton kernel per layer. + +This eliminates intermediate HBM reads/writes for q, k, v, g, beta tensors +and removes multiple small kernel launches (normalize, repeat_interleave, +sigmoid, softplus, exp) that the previous partial-fusion approach required. + +For each (batch, v_head): + k_head = v_head // V_PER_K # shared K head + q, k = L2_normalize(qkv_conv[Q/K]) # split + normalize + v = qkv_conv[V] # split + decay = exp(-exp(A_log) * softplus(alpha + dt_bias)) + beta = sigmoid(beta_raw) + state = state * decay # decay + Sk = state @ k # [V] + delta = beta * (v - Sk) # [V] + state = state + outer(k, delta) # rank-1 update + output = state @ (q * scale) # [V] + +The kernel tiles over the V dimension in blocks of BLOCK_V. +For each V-tile, it streams through K in blocks of BLOCK_K. + +Registered as torch.ops.triton.fused_deltanet_decode for AOTI compilation. 
+""" + +import torch +import triton +import triton.language as tl +from torch.library import triton_op, wrap_triton + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_K": 32, "BLOCK_V": 32}), + triton.Config({"BLOCK_K": 64, "BLOCK_V": 64}), + triton.Config({"BLOCK_K": 128, "BLOCK_V": 128}), + triton.Config({"BLOCK_K": 128, "BLOCK_V": 64}), + triton.Config({"BLOCK_K": 64, "BLOCK_V": 128}), + ], + key=["K", "V_DIM"], +) +@triton.jit +def _fused_deltanet_decode_kernel( + # Tensor pointers + QKV_ptr, # [B, conv_dim] post-conv1d+silu output + Alpha_ptr, # [B, H] raw gating input (a) + BetaRaw_ptr, # [B, H] raw write strength (b, pre-sigmoid) + NegAExp_ptr, # [H] -exp(A_log), precomputed + DtBias_ptr, # [H] dt_bias parameter + S_in_ptr, # [B, H, K, V] recurrent state input (read-only) + S_out_ptr, # [B, H, K, V] recurrent state output (write-only) + O_ptr, # [B, H, V] output + # Dimension constants + K: tl.constexpr, # head_k_dim (128) + V_DIM: tl.constexpr, # head_v_dim (128) + KEY_DIM: tl.constexpr, # num_k_heads * K (2048) + V_PER_K: tl.constexpr, # num_v_heads // num_k_heads (2) + SCALE: tl.constexpr, # K^(-0.5) + L2_EPS: tl.constexpr, # 1e-6 + # Strides + stride_qkv_b, # qkv stride for batch dim + stride_ab, # alpha stride for batch dim + stride_bb, # beta_raw stride for batch dim + stride_s_b, # state stride: batch + stride_s_h, # state stride: head + stride_s_k, # state stride: K dim + stride_s_v, # state stride: V dim + stride_ob, # output stride: batch + stride_oh, # output stride: head + stride_ov, # output stride: V dim + # Block sizes (autotuned) + BLOCK_K: tl.constexpr, + BLOCK_V: tl.constexpr, +): + """One program per (batch, v_head, v_block).""" + pid_bh = tl.program_id(0) # batch * num_v_heads index + pid_v = tl.program_id(1) # V-tile index + + # Decompose pid_bh into batch and v_head + H: tl.constexpr = KEY_DIM // K * V_PER_K # num_v_heads + bid = pid_bh // H + h = pid_bh % H + k_head = h // V_PER_K # corresponding K head + + # V-tile range + 
v_start = pid_v * BLOCK_V + v_offs = v_start + tl.arange(0, BLOCK_V) + v_mask = v_offs < V_DIM + + # ====== Phase 1: Load V slice from qkv_conv ====== + # Layout: qkv_conv = [Q(KEY_DIM) | K(KEY_DIM) | V(H * V_DIM)] + qkv_base = QKV_ptr + bid * stride_qkv_b + v_base = qkv_base + 2 * KEY_DIM + h * V_DIM + v_vals = tl.load(v_base + v_offs, mask=v_mask, other=0.0).to(tl.float32) + + # ====== Phase 2: Compute gating and beta ====== + alpha_h = tl.load(Alpha_ptr + bid * stride_ab + h).to(tl.float32) + neg_a_exp_h = tl.load(NegAExp_ptr + h).to(tl.float32) + dt_bias_h = tl.load(DtBias_ptr + h).to(tl.float32) + + # softplus with numerical stability + sp_input = alpha_h + dt_bias_h + sp = tl.where(sp_input > 20.0, sp_input, tl.log(1.0 + tl.exp(sp_input))) + gate = neg_a_exp_h * sp # always negative + decay = tl.exp(gate) + + beta_raw_h = tl.load(BetaRaw_ptr + bid * stride_bb + h).to(tl.float32) + beta = tl.sigmoid(beta_raw_h) + + # ====== Phase 3: Compute K and Q L2 norms (full-vector reduction) ====== + # Each v_block program needs the full K-vector norms, so we compute them here. + # This is redundant across v_blocks for the same (batch, head) but avoids + # a separate kernel launch or shared memory coordination. 
+ q_base = qkv_base + k_head * K + k_base = qkv_base + KEY_DIM + k_head * K + + q_sq_sum = tl.zeros([], dtype=tl.float32) + k_sq_sum = tl.zeros([], dtype=tl.float32) + for kk in range(0, K, BLOCK_K): + kk_offs = kk + tl.arange(0, BLOCK_K) + kk_mask = kk_offs < K + q_chunk = tl.load(q_base + kk_offs, mask=kk_mask, other=0.0).to(tl.float32) + k_chunk = tl.load(k_base + kk_offs, mask=kk_mask, other=0.0).to(tl.float32) + q_sq_sum += tl.sum(q_chunk * q_chunk) + k_sq_sum += tl.sum(k_chunk * k_chunk) + + q_norm = tl.maximum(tl.sqrt(q_sq_sum), L2_EPS) + k_norm = tl.maximum(tl.sqrt(k_sq_sum), L2_EPS) + + # ====== Phase 4: Recurrent state update ====== + s_in_base = S_in_ptr + bid * stride_s_b + h * stride_s_h + s_out_base = S_out_ptr + bid * stride_s_b + h * stride_s_h + + # --- Pass 1: Decay state, compute Sk = (decay*S)^T @ k_normalized --- + sk_acc = tl.zeros([BLOCK_V], dtype=tl.float32) + for kk in range(0, K, BLOCK_K): + kk_offs = kk + tl.arange(0, BLOCK_K) + kk_mask = kk_offs < K + + # Load normalized k slice + k_vals = ( + tl.load(k_base + kk_offs, mask=kk_mask, other=0.0).to(tl.float32) / k_norm + ) + + # Load state tile [BLOCK_K, BLOCK_V] + tile_offs = kk_offs[:, None] * stride_s_k + v_offs[None, :] * stride_s_v + tile_mask = kk_mask[:, None] & v_mask[None, :] + s_tile = tl.load(s_in_base + tile_offs, mask=tile_mask, other=0.0).to( + tl.float32 + ) + + # Decay + s_tile = s_tile * decay + + # Sk[v] += sum_k(state[k,v] * k_normalized[k]) + sk_acc += tl.sum(s_tile * k_vals[:, None], axis=0) + + # delta = beta * (v - Sk) + delta_v = beta * (v_vals - sk_acc) + + # --- Pass 2: Re-read input, decay + rank-1 update, write output state, compute output --- + out_acc = tl.zeros([BLOCK_V], dtype=tl.float32) + for kk in range(0, K, BLOCK_K): + kk_offs = kk + tl.arange(0, BLOCK_K) + kk_mask = kk_offs < K + + # Load normalized k and q slices + k_vals = ( + tl.load(k_base + kk_offs, mask=kk_mask, other=0.0).to(tl.float32) / k_norm + ) + q_vals = ( + tl.load(q_base + kk_offs, 
mask=kk_mask, other=0.0).to(tl.float32) + / q_norm + * SCALE + ) + + # Re-read input state and decay + tile_offs = kk_offs[:, None] * stride_s_k + v_offs[None, :] * stride_s_v + tile_mask = kk_mask[:, None] & v_mask[None, :] + s_tile = tl.load(s_in_base + tile_offs, mask=tile_mask, other=0.0).to( + tl.float32 + ) + s_tile = s_tile * decay + + # Rank-1 update: S += k ⊗ delta + s_tile = s_tile + k_vals[:, None] * delta_v[None, :] + + # Store updated state + tl.store( + s_out_base + tile_offs, + s_tile.to(S_out_ptr.dtype.element_ty), + mask=tile_mask, + ) + + # Output: out[v] += sum_k(S_new[k,v] * q_scaled[k]) + out_acc += tl.sum(s_tile * q_vals[:, None], axis=0) + + # Store output + o_offs = O_ptr + bid * stride_ob + h * stride_oh + v_offs * stride_ov + tl.store(o_offs, out_acc.to(O_ptr.dtype.element_ty), mask=v_mask) + + +@triton_op("triton::fused_deltanet_decode", mutates_args={}) +def fused_deltanet_decode( + qkv: torch.Tensor, + alpha: torch.Tensor, + beta_raw: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, + state: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Fully-fused GatedDeltaNet decode (T=1) recurrent step. + + Fuses Q/K/V split, L2 normalization, head repeat, gating, and delta rule + recurrence into a single kernel. 
+ + Args: + qkv: [B, conv_dim] post-conv1d+silu output (Q|K|V concatenated) + alpha: [B, num_v_heads] raw gating input (pre-softplus) + beta_raw: [B, num_v_heads] raw write strength (pre-sigmoid) + A_log: [num_v_heads] log(A) parameter (negated exp computed inside) + dt_bias: [num_v_heads] gating bias parameter + state: [B, num_v_heads, K, V] recurrent state (read-only, not mutated) + + Returns: + tuple of (output, new_state): + output: [B, num_v_heads, V] decode output (same dtype as state) + new_state: [B, num_v_heads, K, V] updated state (same dtype as state) + """ + B = qkv.shape[0] + H, K, V_DIM = state.shape[1], state.shape[2], state.shape[3] + + # Derive layout constants from tensor shapes + # conv_dim = 2 * KEY_DIM + H * V_DIM, KEY_DIM = num_k_heads * K + value_dim = H * V_DIM + KEY_DIM = (qkv.shape[1] - value_dim) // 2 + num_k_heads = KEY_DIM // K + V_PER_K = H // num_k_heads + + output = torch.empty(B, H, V_DIM, dtype=state.dtype, device=qkv.device) + + # Compute neg_A_exp from A_log parameter + neg_A_exp = -torch.exp(A_log.float()) + + # Separate input/output state buffers for autotuning safety + # (autotuner may re-run the kernel; reading from a buffer we also write + # would produce wrong results on the second run) + state_in = state.float().contiguous() + state_out = torch.empty_like(state_in) + + def grid(meta): + return (B * H, triton.cdiv(V_DIM, meta["BLOCK_V"])) + + wrap_triton(_fused_deltanet_decode_kernel)[grid]( + qkv, + alpha, + beta_raw, + neg_A_exp, + dt_bias, + state_in, + state_out, + output, + # Dimensions + K=K, + V_DIM=V_DIM, + KEY_DIM=KEY_DIM, + V_PER_K=V_PER_K, + SCALE=K**-0.5, + L2_EPS=1e-6, + # Strides + stride_qkv_b=qkv.stride(0), + stride_ab=alpha.stride(0), + stride_bb=beta_raw.stride(0), + stride_s_b=state_in.stride(0), + stride_s_h=state_in.stride(1), + stride_s_k=state_in.stride(2), + stride_s_v=state_in.stride(3), + stride_ob=output.stride(0), + stride_oh=output.stride(1), + stride_ov=output.stride(2), + ) + + return output, 
state_out.to(state.dtype)
+
+
+@fused_deltanet_decode.register_fake
+def _fused_deltanet_decode_fake(
+    qkv: torch.Tensor,
+    alpha: torch.Tensor,
+    beta_raw: torch.Tensor,
+    A_log: torch.Tensor,
+    dt_bias: torch.Tensor,
+    state: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B = qkv.shape[0]
+    H, K_DIM, V_DIM = state.shape[1], state.shape[2], state.shape[3]
+    output = torch.empty(B, H, V_DIM, dtype=state.dtype, device=qkv.device)
+    new_state = torch.empty(B, H, K_DIM, V_DIM, dtype=state.dtype, device=qkv.device)
+    return output, new_state
diff --git a/backends/cuda/triton/kernels/test_fused_deltanet_decode.py b/backends/cuda/triton/kernels/test_fused_deltanet_decode.py
new file mode 100644
index 00000000000..3888a9e745f
--- /dev/null
+++ b/backends/cuda/triton/kernels/test_fused_deltanet_decode.py
@@ -0,0 +1,295 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Correctness test: fully-fused Triton decode kernel vs reference PyTorch.
+
+Verifies that torch.ops.triton.fused_deltanet_decode (which takes raw
+qkv_conv, alpha, beta_raw, A_log, dt_bias) produces the same output
+and state as the original GatedDeltaNet T=1 recurrence with manual
+Q/K/V split, L2 norm, head repeat, and gating.
+"""
+
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+
+# Direct import from source tree
+sys.path.insert(0, os.path.dirname(__file__))
+from fused_deltanet_decode import fused_deltanet_decode  # noqa: F401, E402
+
+
+def reference_deltanet_decode(
+    qkv_conv,
+    alpha,
+    beta_raw,
+    neg_A_exp,
+    dt_bias,
+    state,
+    num_k_heads,
+    num_v_heads,
+    head_k_dim,
+    head_v_dim,
+):
+    """Reference PyTorch implementation matching model.py's original T=1 path.
+
+    Does Q/K/V split, L2 norm, head repeat, gating, then recurrent update.
+ """ + B = qkv_conv.shape[0] + key_dim = num_k_heads * head_k_dim + + # Q/K/V split from qkv_conv + q = qkv_conv[:, :key_dim].reshape(B, num_k_heads, head_k_dim) + k = qkv_conv[:, key_dim : 2 * key_dim].reshape(B, num_k_heads, head_k_dim) + v = qkv_conv[:, 2 * key_dim :].reshape(B, num_v_heads, head_v_dim) + + # L2 normalize + q = F.normalize(q.float(), p=2, dim=-1) + k = F.normalize(k.float(), p=2, dim=-1) + v = v.float() + + # Head repeat (16 K-heads -> 32 V-heads) + head_repeat = num_v_heads // num_k_heads + if head_repeat > 1: + q = q.repeat_interleave(head_repeat, dim=1) + k = k.repeat_interleave(head_repeat, dim=1) + + # Gating + beta = torch.sigmoid(beta_raw.float()) + g = neg_A_exp.float() * F.softplus(alpha.float() + dt_bias.float()) + + # Recurrent update (same math as original model.py T=1) + scale = head_k_dim**-0.5 + state_f32 = state.float() + + decay = torch.exp(g).unsqueeze(-1).unsqueeze(-1) # [B, H, 1, 1] + state_f32 = state_f32 * decay + + Sk = torch.einsum("bhkv,bhk->bhv", state_f32, k) + delta = beta.unsqueeze(-1) * (v - Sk) + state_f32 = state_f32 + torch.einsum("bhk,bhv->bhkv", k, delta) + + output = torch.einsum("bhkv,bhk->bhv", state_f32, q) * scale + + state.copy_(state_f32.to(state.dtype)) + return output, state + + +def test_fused_deltanet_decode(): + torch.manual_seed(42) + device = "cuda" + + # Qwen3.5 MoE dimensions + B = 1 + num_k_heads, num_v_heads = 16, 32 + head_k_dim, head_v_dim = 128, 128 + key_dim = num_k_heads * head_k_dim # 2048 + value_dim = num_v_heads * head_v_dim # 4096 + conv_dim = 2 * key_dim + value_dim # 8192 + + # Random qkv_conv (post-conv1d+silu output) + qkv_conv = torch.randn(B, conv_dim, device=device, dtype=torch.bfloat16) * 0.1 + + # Raw alpha and beta (pre-gating, pre-sigmoid) + alpha = torch.randn(B, num_v_heads, device=device, dtype=torch.float32) + beta_raw = torch.randn(B, num_v_heads, device=device, dtype=torch.float32) + + # Model parameters + A_log = torch.log(torch.empty(num_v_heads, 
device=device).uniform_(0.5, 8))
+    neg_A_exp = -torch.exp(A_log).float()
+    dt_bias = torch.ones(num_v_heads, device=device, dtype=torch.float32)
+
+    # Initial state
+    state_init = (
+        torch.randn(
+            B, num_v_heads, head_k_dim, head_v_dim, device=device, dtype=torch.bfloat16
+        )
+        * 0.1
+    )
+
+    # --- Reference ---
+    ref_state = state_init.clone()
+    ref_output, ref_state = reference_deltanet_decode(
+        qkv_conv.clone(),
+        alpha.clone(),
+        beta_raw.clone(),
+        neg_A_exp.clone(),
+        dt_bias.clone(),
+        ref_state,
+        num_k_heads,
+        num_v_heads,
+        head_k_dim,
+        head_v_dim,
+    )
+
+    # --- Fused kernel ---
+    # NOTE: the op takes raw A_log (it negates/exponentiates internally) and
+    # returns (output, new_state) — it does not mutate the state argument.
+    fused_state = state_init.clone()
+    fused_output, fused_state = torch.ops.triton.fused_deltanet_decode(
+        qkv_conv.clone(),
+        alpha.clone(),
+        beta_raw.clone(),
+        A_log.clone(),
+        dt_bias.clone(),
+        fused_state,
+    )
+
+    # Compare
+    out_diff = (fused_output.float() - ref_output.float()).abs()
+    out_max_err = out_diff.max().item()
+    out_mean_err = out_diff.mean().item()
+
+    state_diff = (fused_state.float() - ref_state.float()).abs()
+    state_max_err = state_diff.max().item()
+    state_mean_err = state_diff.mean().item()
+
+    print(f"Output — max err: {out_max_err:.6e}, mean err: {out_mean_err:.6e}")
+    print(f"State — max err: {state_max_err:.6e}, mean err: {state_mean_err:.6e}")
+
+    assert out_max_err < 0.05, f"Output max error too large: {out_max_err}"
+    assert state_max_err < 0.05, f"State max error too large: {state_max_err}"
+    print("PASSED: fused kernel matches reference within tolerance.\n")
+
+    # --- Batch test ---
+    B2 = 4
+    qkv2 = torch.randn(B2, conv_dim, device=device, dtype=torch.bfloat16) * 0.1
+    alpha2 = torch.randn(B2, num_v_heads, device=device, dtype=torch.float32)
+    beta_raw2 = torch.randn(B2, num_v_heads, device=device, dtype=torch.float32)
+    state2_init = (
+        torch.randn(
+            B2, num_v_heads, head_k_dim, head_v_dim, device=device, dtype=torch.bfloat16
+        )
+        * 0.1
+    )
+
+    ref_state2 = state2_init.clone()
+    ref_out2, ref_state2 = reference_deltanet_decode(
+        qkv2.clone(),
+        
alpha2.clone(),
+        beta_raw2.clone(),
+        neg_A_exp.clone(),
+        dt_bias.clone(),
+        ref_state2,
+        num_k_heads,
+        num_v_heads,
+        head_k_dim,
+        head_v_dim,
+    )
+    fused_state2 = state2_init.clone()
+    fused_out2, fused_state2 = torch.ops.triton.fused_deltanet_decode(
+        qkv2.clone(),
+        alpha2.clone(),
+        beta_raw2.clone(),
+        A_log.clone(),
+        dt_bias.clone(),
+        fused_state2,
+    )
+
+    out_err2 = (fused_out2.float() - ref_out2.float()).abs().max().item()
+    state_err2 = (fused_state2.float() - ref_state2.float()).abs().max().item()
+    print(
+        f"Batch={B2} — output max err: {out_err2:.6e}, state max err: {state_err2:.6e}"
+    )
+    assert out_err2 < 0.05, f"Batch output error too large: {out_err2}"
+    assert state_err2 < 0.05, f"Batch state error too large: {state_err2}"
+    print("PASSED: batch test.\n")
+
+    # --- Multi-step sequential test (accumulation drift check) ---
+    print("Testing multi-step sequential decode (10 steps)...")
+    state_ref = (
+        torch.randn(
+            1, num_v_heads, head_k_dim, head_v_dim, device=device, dtype=torch.bfloat16
+        )
+        * 0.01
+    )
+    state_fused = state_ref.clone()
+
+    for _ in range(10):
+        qkv_step = torch.randn(1, conv_dim, device=device, dtype=torch.bfloat16) * 0.1
+        a_step = torch.randn(1, num_v_heads, device=device, dtype=torch.float32)
+        b_step = torch.randn(1, num_v_heads, device=device, dtype=torch.float32)
+
+        ref_out, state_ref = reference_deltanet_decode(
+            qkv_step.clone(),
+            a_step.clone(),
+            b_step.clone(),
+            neg_A_exp.clone(),
+            dt_bias.clone(),
+            state_ref,
+            num_k_heads,
+            num_v_heads,
+            head_k_dim,
+            head_v_dim,
+        )
+        fused_out, state_fused = torch.ops.triton.fused_deltanet_decode(
+            qkv_step.clone(),
+            a_step.clone(),
+            b_step.clone(),
+            A_log.clone(),
+            dt_bias.clone(),
+            state_fused,
+        )
+
+    final_out_err = (fused_out.float() - ref_out.float()).abs().max().item()
+    final_state_err = (state_fused.float() - state_ref.float()).abs().max().item()
+    print(
+        f"After 10 steps — output max err: {final_out_err:.6e}, state max err: {final_state_err:.6e}"
+    )
+    assert
final_out_err < 0.1, f"Multi-step output error too large: {final_out_err}"
+    assert final_state_err < 0.1, f"Multi-step state error too large: {final_state_err}"
+    print("PASSED: multi-step test.\n")
+
+    # --- CUDA Graph compatibility test ---
+    print("Testing CUDA Graph compatibility...")
+    qkv_cg = torch.randn(1, conv_dim, device=device, dtype=torch.bfloat16) * 0.1
+    alpha_cg = torch.randn(1, num_v_heads, device=device, dtype=torch.float32)
+    beta_cg = torch.randn(1, num_v_heads, device=device, dtype=torch.float32)
+    state_cg = state_init[:1].clone()
+
+    # Warmup
+    for _ in range(3):
+        state_cg.copy_(state_init[:1])
+        _ = torch.ops.triton.fused_deltanet_decode(
+            qkv_cg, alpha_cg, beta_cg, A_log, dt_bias, state_cg
+        )
+
+    # Capture
+    state_cg.copy_(state_init[:1])
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        out_cg, state_cg_new = torch.ops.triton.fused_deltanet_decode(
+            qkv_cg, alpha_cg, beta_cg, A_log, dt_bias, state_cg
+        )
+
+    # Replay
+    state_cg.copy_(state_init[:1])
+    graph.replay()
+
+    # Compare with reference using same inputs
+    ref_state_cg = state_init[:1].clone()
+    ref_out_cg, _ = reference_deltanet_decode(
+        qkv_cg.clone(),
+        alpha_cg.clone(),
+        beta_cg.clone(),
+        neg_A_exp.clone(),
+        dt_bias.clone(),
+        ref_state_cg,
+        num_k_heads,
+        num_v_heads,
+        head_k_dim,
+        head_v_dim,
+    )
+    cg_err = (out_cg.float() - ref_out_cg.float()).abs().max().item()
+    print(f"CUDA Graph — output max err: {cg_err:.6e}")
+    assert cg_err < 0.05, f"CUDA Graph output error too large: {cg_err}"
+    print("PASSED: CUDA Graph compatible.\n")
+
+    print("ALL TESTS PASSED.")
+
+
+if __name__ == "__main__":
+    test_fused_deltanet_decode()
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
index 19a720a2e79..b2a669a237a 100644
--- a/examples/models/qwen3_5_moe/export.py
+++ b/examples/models/qwen3_5_moe/export.py
@@ -113,7 +113,6 @@ def load_prequantized_model(prequantized_dir, max_seq_len=4096):
             ".conv_state",
             ".recurrent_state",
".cache_positions", - ".inv_freq", ) expected_missing = {k for k in missing if any(p in k for p in runtime_prefixes)} weight_missing = set(missing) - expected_missing diff --git a/examples/models/qwen3_5_moe/model.py b/examples/models/qwen3_5_moe/model.py index 751915fb123..faccebe0a81 100644 --- a/examples/models/qwen3_5_moe/model.py +++ b/examples/models/qwen3_5_moe/model.py @@ -392,57 +392,41 @@ def forward(self, x, input_pos): acc = acc + conv_input[:, :, k : k + T_conv].float() * w[:, k : k + 1] qkv_conv = F.silu(acc[:, :, -T:]).to(conv_input.dtype).transpose(1, 2) - # Split via slicing (torch.split produces split_copy which lacks AOTI fallback) - kd = self.key_dim - q = qkv_conv[..., :kd].reshape(B, T, self.num_k_heads, self.head_k_dim) - k = qkv_conv[..., kd : 2 * kd].reshape(B, T, self.num_k_heads, self.head_k_dim) - v = qkv_conv[..., 2 * kd :].reshape(B, T, self.num_v_heads, self.head_v_dim) - - # L2-normalize Q and K (the FLA kernel expects pre-normalized inputs; - # HF reference uses use_qk_l2norm_in_kernel=True which does this inside) - q = F.normalize(q, p=2, dim=-1) - k = F.normalize(k, p=2, dim=-1) - - # head_repeat for k_heads != v_heads - if self.head_repeat > 1: - q = q.repeat_interleave(self.head_repeat, dim=2) - k = k.repeat_interleave(self.head_repeat, dim=2) - - # Mamba-style gating - beta = b.sigmoid() - g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) - if T == 1: - # Native recurrent delta rule — AOTI fuses with surrounding ops - scale = self.head_k_dim**-0.5 - - q_s = q[:, 0].float() # [B, H, K] - k_s = k[:, 0].float() # [B, H, K] - v_s = v[:, 0].float() # [B, H, V] - g_s = g[:, 0] # [B, H] - beta_s = beta[:, 0] # [B, H] - - state = self.recurrent_state[:B].float() # [B, H, K, V] + # Fully-fused Triton decode kernel: Q/K/V split, L2 norm, + # head repeat, gating, and delta rule recurrence in one kernel. + # State is mutated in-place (CUDA Graph compatible). 
+ output, new_state = torch.ops.triton.fused_deltanet_decode( + qkv_conv[:, 0], # [B, conv_dim] — squeeze T=1 + a[:, 0], # [B, H] raw alpha + b[:, 0], # [B, H] raw beta + self.A_log, # [H] nn.Parameter + self.dt_bias, # [H] nn.Parameter + self.recurrent_state[:B], # [B, H, K, V] + ) + output = output.unsqueeze(1).to(qkv_conv.dtype) # [B, 1, H, V] - # Decay state by exp(g) - decay = torch.exp(g_s).unsqueeze(-1).unsqueeze(-1) # [B, H, 1, 1] - state = state * decay + with torch.no_grad(): + self.recurrent_state[:B].copy_(new_state) + else: + # Chunked FLA triton_op for prefill — needs pre-processed inputs + kd = self.key_dim + q = qkv_conv[..., :kd].reshape(B, T, self.num_k_heads, self.head_k_dim) + k = qkv_conv[..., kd : 2 * kd].reshape( + B, T, self.num_k_heads, self.head_k_dim + ) + v = qkv_conv[..., 2 * kd :].reshape(B, T, self.num_v_heads, self.head_v_dim) - # Sk = state @ k (project state by key) - Sk = torch.einsum("bhkv,bhk->bhv", state, k_s) + q = F.normalize(q, p=2, dim=-1) + k = F.normalize(k, p=2, dim=-1) - # Delta rule state update - delta = beta_s.unsqueeze(-1) * (v_s - Sk) # [B, H, V] - state = state + torch.einsum("bhk,bhv->bhkv", k_s, delta) + if self.head_repeat > 1: + q = q.repeat_interleave(self.head_repeat, dim=2) + k = k.repeat_interleave(self.head_repeat, dim=2) - # Output = state @ q * scale - output = torch.einsum("bhkv,bhk->bhv", state, q_s) * scale - output = output.unsqueeze(1).to(q.dtype) # [B, 1, H, V] + beta = b.sigmoid() + g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) - with torch.no_grad(): - self.recurrent_state[:B].copy_(state.to(self.recurrent_state.dtype)) - else: - # Chunked FLA triton_op for prefill output, new_state = torch.ops.triton.chunk_gated_delta_rule( q, k, v, g, beta, self.recurrent_state[:B] )