quic · divytrip3005 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 23, 2026
diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
@@ -394,6 +394,8 @@ def get_onnx_path(
         retain_full_kv: Optional[bool] = False,
         mla_absorption: Optional[Dict[str, bool]] = None,
         qaic_config: Optional[dict] = None,
+        moe_prefill_packed_chunk_size: Optional[int] = None,
+        moe_prefill_num_nsp: Optional[int] = None,
         **compiler_options,
     ):
         kwargs = {
@@ -409,6 +411,11 @@ def get_onnx_path(
                     "prefill_only": prefill_only,
                     "prefill_seq_len": specializations[0].get("seq_len"),
                     "enable_chunking": enable_chunking,
+                    "num_cores": compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES),
+                    "moe_prefill_num_nsp": moe_prefill_num_nsp,
+                    "moe_prefill_packed_chunk_size": constants.MOE_PREFILL_PACKED_CHUNK_SIZE
+                    if moe_prefill_packed_chunk_size is None
+                    else moe_prefill_packed_chunk_size,
                 }
             )
 
@@ -527,6 +534,8 @@ def _compile(
                 For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
         """
 
+        moe_prefill_packed_chunk_size = compiler_options.pop("moe_prefill_packed_chunk_size", None)
+        moe_prefill_num_nsp = compiler_options.pop("moe_prefill_num_nsp", None)
         onnx_path = Path(
             onnx_path
             if onnx_path
@@ -542,6 +551,8 @@ def _compile(
                 mla_absorption,
                 num_devices=mdp_ts_num_devices,
                 qaic_config=qaic_config,
+                moe_prefill_packed_chunk_size=moe_prefill_packed_chunk_size,
+                moe_prefill_num_nsp=moe_prefill_num_nsp,
                 **compiler_options,
             )
         )

diff --git a/QEfficient/base/onnx_transforms.py b/QEfficient/base/onnx_transforms.py
@@ -21,11 +21,15 @@
     CtxGatherBlockedKV,
     CtxGatherFunc,
     CtxGatherFunc3D,
+    CtxGatherFunc3DGeneralized,
     CtxGatherFuncBlockedKV,
     CtxScatter,
     CtxScatter3D,
+    CtxScatter3DInt,
     CtxScatterFunc,
     CtxScatterFunc3D,
+    CtxScatterFunc3DGeneralized,
+    CtxScatterFunc3DInt,
 )
 from QEfficient.customop.ctx_scatter_gather_cb import (
     CtxGatherBlockedKVCB,
@@ -92,8 +96,11 @@ class CustomOpTransform(BaseOnnxTransform):
         "CustomRMSNormFunc": (CustomRMSNormFunc, CustomRMSNorm),
         "CtxScatterFunc": (CtxScatterFunc, CtxScatter),
         "CtxScatterFunc3D": (CtxScatterFunc3D, CtxScatter3D),
+        "CtxScatterFunc3DInt": (CtxScatterFunc3DInt, CtxScatter3DInt),
+        "CtxScatterFunc3DGeneralized": (CtxScatterFunc3DGeneralized, CtxScatter3D),
         "CtxGatherFunc": (CtxGatherFunc, CtxGather),
         "CtxGatherFunc3D": (CtxGatherFunc3D, CtxGather3D),
+        "CtxGatherFunc3DGeneralized": (CtxGatherFunc3DGeneralized, CtxGather3D),
         "CtxScatterFuncCB3D": (CtxScatterFuncCB3D, CtxScatterCB3D),
         "CtxGatherFuncCB3D": (CtxGatherFuncCB3D, CtxGatherCB3D),
         "CtxGatherFuncBlockedKV": (CtxGatherFuncBlockedKV, CtxGatherBlockedKV),

diff --git a/QEfficient/customop/__init__.py b/QEfficient/customop/__init__.py
@@ -8,9 +8,12 @@
 from QEfficient.customop.ctx_scatter_gather import (
     CtxGatherFunc,
     CtxGatherFunc3D,
+    CtxGatherFunc3DGeneralized,
     CtxGatherFuncBlockedKV,
     CtxScatterFunc,
     CtxScatterFunc3D,
+    CtxScatterFunc3DGeneralized,
+    CtxScatterFunc3DInt,
 )
 from QEfficient.customop.ctx_scatter_gather_cb import (
     CtxGatherFuncBlockedKVCB,
@@ -26,7 +29,10 @@
     "CtxGatherFuncBlockedKV",
     "CtxScatterFunc",
     "CtxGatherFunc3D",
+    "CtxGatherFunc3DGeneralized",
     "CtxScatterFunc3D",
+    "CtxScatterFunc3DGeneralized",
+    "CtxScatterFunc3DInt",
     "CustomRMSNormAIC",
     "GemmaCustomRMSNormAIC",
     "CtxGatherFuncCB",

diff --git a/QEfficient/customop/ctx_scatter_gather.py b/QEfficient/customop/ctx_scatter_gather.py
@@ -69,6 +69,9 @@ def CtxScatter3D(data: onnxscript.FLOAT, position_ids: onnxscript.INT32, updates
 
     # Create indices
     batch_idx = ops.Expand(ops.Unsqueeze(ops.Range(zero, batch_size, one), [1, 2]), exp_shape)
+
+    # Keep index tensor dtypes aligned for backends that require an exact dtype match
+    batch_idx = ops.Cast(batch_idx, to=onnxscript.INT32.dtype)
     ctx_idx = ops.Expand(ops.Unsqueeze(position_ids, [2]), exp_shape)
     indices = ops.Concat(batch_idx, ctx_idx, axis=2)
 
@@ -78,8 +81,9 @@ def CtxScatter3D(data: onnxscript.FLOAT, position_ids: onnxscript.INT32, updates
 class CtxScatterFunc3D(torch.autograd.Function):
     @staticmethod
     def forward(data: torch.Tensor, position_ids: torch.Tensor, updates: torch.Tensor):
+        data = data.clone()
         batch_idx = torch.arange(data.shape[0]).view(-1, 1)
-        ctx_idx = position_ids
+        ctx_idx = torch.where(position_ids == torch.iinfo(torch.int32).max, data.shape[1] - 1, position_ids)
         data[batch_idx, ctx_idx] = updates
         return data
 
@@ -92,9 +96,80 @@ def symbolic(g: torch.Graph, data: torch.Value, position_ids: torch.Value, updat
         return g.onnxscript_op(CtxScatter3D, data, position_ids, updates).setTypeAs(data)
 
 
+class CtxScatterFunc3DGeneralized(torch.autograd.Function):
+    """Scatter variant whose eager ``forward`` keeps ``data`` at INT32_MAX positions.
+
+    :class:`CtxScatterFunc3D` redirects invalid rows to ``data.shape[1] - 1``
+    (potentially clobbering valid content); this ``forward`` drops them and
+    writes into a clone of ``data``. NOTE(review): ``symbolic`` reuses the shared
+    ``CtxScatter3D`` op -- confirm the exported graph also tolerates INT32_MAX.
+    """
+
+    @staticmethod
+    def forward(data: torch.Tensor, position_ids: torch.Tensor, updates: torch.Tensor):
+        data = data.clone()  # write into a copy so the caller's tensor is not mutated
+        valid = position_ids != torch.iinfo(torch.int32).max  # False at the INT32_MAX sentinel
+        batch_idx = torch.arange(data.shape[0], device=data.device).view(-1, 1).expand_as(position_ids)
+        data[batch_idx[valid], position_ids[valid].long()] = updates[valid]  # sentinel entries are skipped
+        return data
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        pass  # nothing is saved on ctx
+
+    @staticmethod
+    def symbolic(g: torch.Graph, data: torch.Value, position_ids: torch.Value, updates: torch.Value) -> torch.Value:
+        return g.onnxscript_op(CtxScatter3D, data, position_ids, updates).setTypeAs(data)
+
+
+@onnxscript.script(onnxscript.values.Opset("com.qualcomm.cloud", 1))
+def CtxScatter3DInt(
+    data: onnxscript.INT32, position_ids: onnxscript.INT32, updates: onnxscript.INT32
+) -> onnxscript.INT32:
+    # Read the batch size from dim 0 of data and the sequence length from dim 1 of position_ids
+    batch_size = ops.Gather(ops.Shape(data), [0])
+    seq_len = ops.Gather(ops.Shape(position_ids), [1])
+
+    # Shape (batch_size, seq_len, 1) that each index component is expanded to
+    zero = ops.Constant(value_ints=[0])
+    one = ops.Constant(value_ints=[1])
+    exp_shape = ops.Concat(batch_size, seq_len, one, axis=0)
+
+    # Pair every position id with its batch index; concatenating on axis 2 gives the ScatterND indices
+    batch_idx = ops.Expand(ops.Unsqueeze(ops.Range(zero, batch_size, one), [1, 2]), exp_shape)
+    batch_idx = ops.Cast(batch_idx, to=onnxscript.INT32.dtype)  # same dtype as the INT32 position_ids
+    ctx_idx = ops.Expand(ops.Unsqueeze(position_ids, [2]), exp_shape)
+    indices = ops.Concat(batch_idx, ctx_idx, axis=2)
+
+    return ops.ScatterND(data, indices, updates)
+
+
+class CtxScatterFunc3DInt(torch.autograd.Function):
+    """Int32-typed scatter (exported as ``CtxScatter3DInt``) used to build a packed->original index table."""
+
+    @staticmethod
+    def forward(data: torch.Tensor, position_ids: torch.Tensor, updates: torch.Tensor):
+        data = data.clone()  # the input tensor itself is left unmodified
+        valid = position_ids != torch.iinfo(torch.int32).max  # INT32_MAX marks entries to skip
+        batch_idx = torch.arange(data.shape[0], device=data.device).view(-1, 1).expand_as(position_ids)
+        data[batch_idx[valid], position_ids[valid].long()] = updates[valid]  # write only the kept entries
+        return data
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        pass  # no state is stored on ctx
+
+    @staticmethod
+    def symbolic(g: torch.Graph, data: torch.Value, position_ids: torch.Value, updates: torch.Value) -> torch.Value:
+        return g.onnxscript_op(CtxScatter3DInt, data, position_ids, updates).setTypeAs(data)
+
+
 @onnxscript.script(onnxscript.values.Opset("com.qualcomm.cloud", 1))
 def CtxGather3D(data: onnxscript.FLOAT, ctx_indices: onnxscript.INT32) -> onnxscript.FLOAT:
-    ctx_indices = ops.Expand(ctx_indices, ops.Slice(ops.Shape(data), starts=[0], ends=[2], axes=[0]))
+    batch_size = ops.Slice(ops.Shape(data), starts=[0], ends=[1], axes=[0])  # data.shape[0:1]
+    idx_seq_len = ops.Slice(ops.Shape(ctx_indices), starts=[1], ends=[2], axes=[0])  # ctx_indices.shape[1:2]
+    expand_shape = ops.Concat(batch_size, idx_seq_len, axis=0)  # index count comes from ctx_indices, not data
+    ctx_indices = ops.Expand(ctx_indices, expand_shape)  # one row of indices per batch entry
     ctx_indices = ops.Unsqueeze(ctx_indices, [-1])
     return ops.GatherND(data, ctx_indices, batch_dims=1)
 
@@ -103,6 +178,7 @@ class CtxGatherFunc3D(torch.autograd.Function):
     @staticmethod
     def forward(data: torch.Tensor, ctx_indices: torch.Tensor):
         batch_indices = torch.arange(data.shape[0]).view(-1, 1)
+        ctx_indices = torch.where(ctx_indices == torch.iinfo(torch.int32).max, 0, ctx_indices)
         return data[batch_indices, ctx_indices]
 
     @staticmethod
@@ -114,6 +190,31 @@ def symbolic(g: torch.Graph, data: torch.Value, ctx_indices: torch.Value) -> tor
         return g.onnxscript_op(CtxGather3D, data, ctx_indices).setTypeAs(data)
 
 
+class CtxGatherFunc3DGeneralized(torch.autograd.Function):
+    """Gather variant that tolerates INT32_MAX indices (invalid rows read from 0).
+
+    The eager ``forward`` repeats the logic of :class:`CtxGatherFunc3D`; this is
+    a separate autograd op so that callers of the packed/cumsum scatter pipeline
+    are easy to recognize, and so that ``symbolic`` can omit ``setTypeAs`` (for
+    callers whose ``data`` already has the matching dtype and who want the op
+    signature to flow through without dtype pinning).
+    """
+
+    @staticmethod
+    def forward(data: torch.Tensor, ctx_indices: torch.Tensor):
+        batch_indices = torch.arange(data.shape[0]).view(-1, 1)  # column of batch ids, one per row of data
+        ctx_indices = torch.where(ctx_indices == torch.iinfo(torch.int32).max, 0, ctx_indices)  # sentinel -> 0
+        return data[batch_indices, ctx_indices]  # index dim 0 by batch id and dim 1 by ctx index
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        pass  # nothing is saved on ctx
+
+    @staticmethod
+    def symbolic(g: torch.Graph, data: torch.Value, ctx_indices: torch.Value) -> torch.Value:
+        return g.onnxscript_op(CtxGather3D, data, ctx_indices)  # no setTypeAs, unlike CtxGatherFunc3D
+
+
 @onnxscript.script(onnxscript.values.Opset("com.qualcomm.cloud", 1))
 def CtxGather(
     data: onnxscript.FLOAT, ctx_indices: onnxscript.INT32, comp_ctx_len: onnxscript.INT32

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
@@ -196,7 +196,7 @@
 DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH = {"gemma3", "llama4", "gemma3_text", "llama4_text"}
 
 # This is for supporting different modelling classes specially written for prefill-only model
-SPECIALIZED_DISAGG_SERVING_MODEL_ARCH = {"gpt_oss", "kimi_k2", "kimi_k25"}
+SPECIALIZED_DISAGG_SERVING_MODEL_ARCH = {"gpt_oss", "qwen3_moe", "granitemoe", "kimi_k2", "kimi_k25"}
 
 _PROXY_ONLY_ONNX_TRANSFORMS = (FP16ClipTransform, SplitTensorsTransform)