From 2aa8c167d6902d492ff1a62053ad5d10ef57f58a Mon Sep 17 00:00:00 2001
From: quic-xiyushi <xiyushi@qti.qualcomm.com>
Date: Tue, 21 Apr 2026 10:34:49 -0700
Subject: [PATCH 1/3] Add on-device sampling support for Qwen3VL Dense

Signed-off-by: quic-xiyushi <xiyushi@qti.qualcomm.com>
---
 QEfficient/transformers/models/pytorch_transforms.py |  2 ++
 QEfficient/transformers/sampler/sampler.py           | 11 +++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 31c86a9c72..73ac27eed8 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -457,6 +457,7 @@
     QEffQwen3MoeSparseMoeBlock,
 )
 from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import (
+    QEffQwen3VLDecoderWrapper,
     QEffQwen3VLForConditionalGeneration,
     QEffQwen3VLModel,
     QEffQwen3VLTextAttention,
@@ -854,6 +855,7 @@ class SamplerTransform:
         QEffPhi3ForCausalLM,
         QEffQwen2ForCausalLM,
         QEffQwen_2_5_vl_DecoderWrapper,
+        QEffQwen3VLDecoderWrapper,
     }
 
     @classmethod
diff --git a/QEfficient/transformers/sampler/sampler.py b/QEfficient/transformers/sampler/sampler.py
index 5c86b63553..e0dd1a8fea 100644
--- a/QEfficient/transformers/sampler/sampler.py
+++ b/QEfficient/transformers/sampler/sampler.py
@@ -25,6 +25,7 @@ class SamplerOutput(ModelOutput):
     probs: torch.FloatTensor = None
     next_tokens: torch.IntTensor = None
     vision_embeds: Optional[torch.FloatTensor] = None  # For VLMs
+    deepstack_features: Optional[torch.FloatTensor] = None  # For Qwen3VL
     image_idx: Optional[torch.IntTensor] = None  # for VLMs
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     past_repetition_penalty_buffer: Optional[torch.Tensor] = None
@@ -110,6 +111,7 @@ def sampler_forward(
     comp_ctx_lengths: Optional[torch.LongTensor] = None,
     batch_index: Optional[torch.LongTensor] = None,
     inputs_embeds: Optional[torch.FloatTensor] = None,
+    deepstack_features: Optional[torch.FloatTensor] = None,
     labels: Optional[torch.LongTensor] = None,
     use_cache: Optional[bool] = None,
     output_attentions: Optional[bool] = None,
@@ -195,11 +197,15 @@ def sampler_forward(
             past_key_values=past_key_values,
             comp_ctx_lengths=comp_ctx_lengths,
         )
+        output_keys = ["logits", "vision_embeds", "image_idx", "past_key_values"]
         if batch_index is not None:
             forward_kwargs["batch_index"] = batch_index
+        if deepstack_features is not None:
+            forward_kwargs["deepstack_features"] = deepstack_features
+            output_keys.insert(2, "deepstack_features")
 
-        logits, vision_embeds, image_idx, past_key_values = self.old_forward(**forward_kwargs)
-        outputs = dict(logits=logits, vision_embeds=vision_embeds, image_idx=image_idx, past_key_values=past_key_values)
+        result = self.old_forward(**forward_kwargs)
+        outputs = dict(zip(output_keys, result))
         if position_ids.dim() == 3:  # For models using m-rope
             position_ids = position_ids[0]
     else:
@@ -356,6 +362,7 @@ def sampler_forward(
         probs=probs,
         next_tokens=next_tokens,  # Return sampled next tokens instead of logits
         vision_embeds=outputs.get("vision_embeds", None),
+        deepstack_features=outputs.get("deepstack_features", None),
         image_idx=outputs.get("image_idx", None),
         past_key_values=outputs.get("past_key_values", None),
         past_repetition_penalty_buffer=past_repetition_penalty_buffer,

From 0047fb0b9e9e4ec40bd9912ce812f61f1d4766da Mon Sep 17 00:00:00 2001
From: quic-xiyushi <xiyushi@qti.qualcomm.com>
Date: Tue, 21 Apr 2026 13:59:42 -0700
Subject: [PATCH 2/3] Fix on-device sampling test after changes in QPC path API

Signed-off-by: quic-xiyushi <xiyushi@qti.qualcomm.com>
---
 tests/transformers/sampler/test_sampler.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/transformers/sampler/test_sampler.py b/tests/transformers/sampler/test_sampler.py
index 2434f89283..924878a1fe 100644
--- a/tests/transformers/sampler/test_sampler.py
+++ b/tests/transformers/sampler/test_sampler.py
@@ -166,9 +166,10 @@ def test_sampler_transform(
         mxfp6_matmul=True,
     )
     if is_vlm:
-        model_w_sampler_qpc_path = model_w_sampler_qpc_path[1]
-        model_w_sampler_w_guided_decoding_qpc_path = model_w_sampler_w_guided_decoding_qpc_path[1]
-        model_wo_sampler_qpc_path = model_wo_sampler_qpc_path[1]
+        lang_qpc_path = "lang_qpc_path"
+        model_w_sampler_qpc_path = model_w_sampler_qpc_path[lang_qpc_path]
+        model_w_sampler_w_guided_decoding_qpc_path = model_w_sampler_w_guided_decoding_qpc_path[lang_qpc_path]
+        model_wo_sampler_qpc_path = model_wo_sampler_qpc_path[lang_qpc_path]
 
     # Init qaic session
     model_w_sampler_session = QAICInferenceSession(model_w_sampler_qpc_path)

From c2efe561788f528e113914cff4eda6401c4c64bd Mon Sep 17 00:00:00 2001
From: quic-xiyushi <xiyushi@qti.qualcomm.com>
Date: Wed, 22 Apr 2026 08:21:32 -0700
Subject: [PATCH 3/3] Add on-device sampling test for Qwen3VL

Signed-off-by: quic-xiyushi <xiyushi@qti.qualcomm.com>
---
 QEfficient/transformers/sampler/sampler.py |  2 +-
 tests/transformers/sampler/test_sampler.py | 68 ++++++++++++++++++++++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/sampler/sampler.py b/QEfficient/transformers/sampler/sampler.py
index e0dd1a8fea..9460bd221e 100644
--- a/QEfficient/transformers/sampler/sampler.py
+++ b/QEfficient/transformers/sampler/sampler.py
@@ -137,7 +137,7 @@ def sampler_forward(
     Perform the sampling of next tokens on the QAIC device (instead of the host)
     and return the next tokens and/or probability distributions.
 
-    The vision_embeds and image_idx parameters are optional
+    The vision_embeds, deepstack_features, and image_idx parameters are optional
     and are used only for VLMs when supported by the original forward function.
 
     Args:
diff --git a/tests/transformers/sampler/test_sampler.py b/tests/transformers/sampler/test_sampler.py
index 924878a1fe..9f79be0330 100644
--- a/tests/transformers/sampler/test_sampler.py
+++ b/tests/transformers/sampler/test_sampler.py
@@ -41,6 +41,19 @@
         None,  # spec_length
         True,  # is_vlm
     ),
+    pytest.param(
+        "Qwen/Qwen3-VL-2B-Instruct",  # model
+        (
+            ["https://picsum.photos/id/237/536/354"] * 2,
+            ["Can you describe the image in detail."] * 2,
+        ),  # images and prompts
+        128,  # prefill_seq_len
+        4096,  # ctx_len
+        20,  # generation_len
+        2,  # full_batch_size
+        None,  # spec_length
+        True,  # is_vlm
+    ),
 ]
 
 
@@ -522,6 +535,61 @@ def test_random_sampling(
                 ]
             ],
         }
+    elif model == "Qwen/Qwen3-VL-2B-Instruct":
+        golden_texts = {
+            "w_sampler": "This is a close-up, top-down photograph of an adorable black puppy resting on weathered wooden flooring",
+            "wo_sampler": "This is a close-up, top-down photograph of a young black puppy, likely a Labrador Retri",
+        }
+        golden_ids = {
+            "w_sampler": [
+                [
+                    1986,
+                    374,
+                    264,
+                    3265,
+                    5239,
+                    11,
+                    1909,
+                    14875,
+                    10300,
+                    315,
+                    458,
+                    40608,
+                    3691,
+                    41189,
+                    40119,
+                    389,
+                    9104,
+                    291,
+                    22360,
+                    36148,
+                ]
+            ],
+            "wo_sampler": [
+                [
+                    1986,
+                    374,
+                    264,
+                    3265,
+                    5239,
+                    11,
+                    1909,
+                    14875,
+                    10300,
+                    315,
+                    264,
+                    3908,
+                    3691,
+                    41189,
+                    11,
+                    4363,
+                    264,
+                    79276,
+                    10392,
+                    461,
+                ]
+            ],
+        }
     for i in range(full_batch_size):
         assert (
             tokenizer.decode(model_w_sampler_exec_info.generated_ids[i][:generation_len]) == golden_texts["w_sampler"]