Skip to content

Commit 0a62078

Browse files
committed
fixup! [test] Enable LoRA in PARD speculative decoding
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
1 parent 80dc58f commit 0a62078

2 files changed

Lines changed: 11 additions & 3 deletions

File tree

tensorrt_llm/_torch/peft/lora/cuda_graph_lora_manager.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,14 @@ def get_layer_idx(
106106
return None
107107

108108
# Ignore LoRA layers without at least one of the target modules.
109+
# Skip LoRA layers that belong to draft model subtrees (e.g., PARD
110+
# embeds a full HF model as a submodule whose layers share the same
111+
# layer_idx values as the target model, causing key collisions).
109112
for name, module in model.named_modules():
110113
if isinstance(module, LoraLayer):
114+
if name.startswith("draft_model."):
115+
logger.debug(f"Skipping draft model LoRA module {name}")
116+
continue
111117
layer_idx = get_layer_idx(model, module, name)
112118
# if target_modules_ids is None, by default enable all modules
113119
if self.target_modules_ids and not any(

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -537,9 +537,11 @@ def _init_cuda_graph_lora_manager(self, lora_config: LoraConfig):
537537
max_lora_size = lora_config.max_loras or 8 # Default fallback
538538
max_batch_size = self.batch_size # Use engine's max batch size
539539

540-
# For spec decode, each generation request contributes
541-
# max_draft_len + 1 tokens per forward pass.
542-
max_tokens_per_seq = (self.original_max_draft_len +
540+
# For spec decode, each generation request can contribute up to
541+
# tokens_per_gen_step tokens per forward pass. This is larger than
542+
# max_draft_len + 1 for modes like PARD, which use extra mask
543+
# tokens in the same generation step.
544+
max_tokens_per_seq = (self.original_max_total_draft_tokens +
543545
1) if self.is_spec_decode else 1
544546
self.cuda_graph_lora_manager = CudaGraphLoraManager(
545547
max_lora_size=max_lora_size,

0 commit comments

Comments
 (0)