From 2aa8c167d6902d492ff1a62053ad5d10ef57f58a Mon Sep 17 00:00:00 2001 From: quic-xiyushi Date: Tue, 21 Apr 2026 10:34:49 -0700 Subject: [PATCH 1/3] Add on-device sampling support for Qwen3VL Dense Signed-off-by: quic-xiyushi --- QEfficient/transformers/models/pytorch_transforms.py | 2 ++ QEfficient/transformers/sampler/sampler.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 31c86a9c72..73ac27eed8 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -457,6 +457,7 @@ QEffQwen3MoeSparseMoeBlock, ) from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import ( + QEffQwen3VLDecoderWrapper, QEffQwen3VLForConditionalGeneration, QEffQwen3VLModel, QEffQwen3VLTextAttention, @@ -854,6 +855,7 @@ class SamplerTransform: QEffPhi3ForCausalLM, QEffQwen2ForCausalLM, QEffQwen_2_5_vl_DecoderWrapper, + QEffQwen3VLDecoderWrapper, } @classmethod diff --git a/QEfficient/transformers/sampler/sampler.py b/QEfficient/transformers/sampler/sampler.py index 5c86b63553..e0dd1a8fea 100644 --- a/QEfficient/transformers/sampler/sampler.py +++ b/QEfficient/transformers/sampler/sampler.py @@ -25,6 +25,7 @@ class SamplerOutput(ModelOutput): probs: torch.FloatTensor = None next_tokens: torch.IntTensor = None vision_embeds: Optional[torch.FloatTensor] = None # For VLMs + deepstack_features: Optional[torch.FloatTensor] = None # For Qwen3VL image_idx: Optional[torch.IntTensor] = None # for VLMs past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None past_repetition_penalty_buffer: Optional[torch.Tensor] = None @@ -110,6 +111,7 @@ def sampler_forward( comp_ctx_lengths: Optional[torch.LongTensor] = None, batch_index: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, + deepstack_features: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -195,11 +197,15 @@ def sampler_forward( past_key_values=past_key_values, comp_ctx_lengths=comp_ctx_lengths, ) + output_keys = ["logits", "vision_embeds", "image_idx", "past_key_values"] if batch_index is not None: forward_kwargs["batch_index"] = batch_index + if deepstack_features is not None: + forward_kwargs["deepstack_features"] = deepstack_features + output_keys.insert(2, "deepstack_features") - logits, vision_embeds, image_idx, past_key_values = self.old_forward(**forward_kwargs) - outputs = dict(logits=logits, vision_embeds=vision_embeds, image_idx=image_idx, past_key_values=past_key_values) + result = self.old_forward(**forward_kwargs) + outputs = dict(zip(output_keys, result)) if position_ids.dim() == 3: # For models using m-rope position_ids = position_ids[0] else: @@ -356,6 +362,7 @@ def sampler_forward( probs=probs, next_tokens=next_tokens, # Return sampled next tokens instead of logits vision_embeds=outputs.get("vision_embeds", None), + deepstack_features=outputs.get("deepstack_features", None), image_idx=outputs.get("image_idx", None), past_key_values=outputs.get("past_key_values", None), past_repetition_penalty_buffer=past_repetition_penalty_buffer, From 0047fb0b9e9e4ec40bd9912ce812f61f1d4766da Mon Sep 17 00:00:00 2001 From: quic-xiyushi Date: Tue, 21 Apr 2026 13:59:42 -0700 Subject: [PATCH 2/3] Fix on-device sampling test after changes in qpc path api Signed-off-by: quic-xiyushi --- tests/transformers/sampler/test_sampler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/transformers/sampler/test_sampler.py b/tests/transformers/sampler/test_sampler.py index 2434f89283..924878a1fe 100644 --- a/tests/transformers/sampler/test_sampler.py +++ b/tests/transformers/sampler/test_sampler.py @@ -166,9 +166,10 @@ def test_sampler_transform( mxfp6_matmul=True, ) if is_vlm: - model_w_sampler_qpc_path = model_w_sampler_qpc_path[1] - model_w_sampler_w_guided_decoding_qpc_path = model_w_sampler_w_guided_decoding_qpc_path[1] - model_wo_sampler_qpc_path = model_wo_sampler_qpc_path[1] + lang_qpc_path = "lang_qpc_path" + model_w_sampler_qpc_path = model_w_sampler_qpc_path[lang_qpc_path] + model_w_sampler_w_guided_decoding_qpc_path = model_w_sampler_w_guided_decoding_qpc_path[lang_qpc_path] + model_wo_sampler_qpc_path = model_wo_sampler_qpc_path[lang_qpc_path] # Init qaic session model_w_sampler_session = QAICInferenceSession(model_w_sampler_qpc_path) From c2efe561788f528e113914cff4eda6401c4c64bd Mon Sep 17 00:00:00 2001 From: quic-xiyushi Date: Wed, 22 Apr 2026 08:21:32 -0700 Subject: [PATCH 3/3] Add on-device sampling test for qwen3vl Signed-off-by: quic-xiyushi --- QEfficient/transformers/sampler/sampler.py | 2 +- tests/transformers/sampler/test_sampler.py | 68 ++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/QEfficient/transformers/sampler/sampler.py b/QEfficient/transformers/sampler/sampler.py index e0dd1a8fea..9460bd221e 100644 --- a/QEfficient/transformers/sampler/sampler.py +++ b/QEfficient/transformers/sampler/sampler.py @@ -137,7 +137,7 @@ def sampler_forward( Perform the sampling of next tokens on the QAIC device (instead of the host) and return the next tokens and/or probability distributions. - The vision_embeds and image_idx parameters are optional + The vision_embeds, deepstack_features, and image_idx parameters are optional and are used only for VLMs when supported by the original forward function. Args: diff --git a/tests/transformers/sampler/test_sampler.py b/tests/transformers/sampler/test_sampler.py index 924878a1fe..9f79be0330 100644 --- a/tests/transformers/sampler/test_sampler.py +++ b/tests/transformers/sampler/test_sampler.py @@ -41,6 +41,19 @@ None, # spec_length True, # is_vlm ), + pytest.param( + "Qwen/Qwen3-VL-2B-Instruct", # model + ( + ["https://picsum.photos/id/237/536/354"] * 2, + ["Can you describe the image in detail."] * 2, + ), # images and prompts + 128, # prefill_seq_len + 4096, # ctx_len + 20, # generation_len + 2, # full_batch_size + None, # spec_length + True, # is_vlm + ), ] @@ -522,6 +535,61 @@ def test_random_sampling( ] ], } + elif model == "Qwen/Qwen3-VL-2B-Instruct": + golden_texts = { + "w_sampler": "This is a close-up, top-down photograph of an adorable black puppy resting on weathered wooden flooring", + "wo_sampler": "This is a close-up, top-down photograph of a young black puppy, likely a Labrador Retri", + } + golden_ids = { + "w_sampler": [ + [ + 1986, + 374, + 264, + 3265, + 5239, + 11, + 1909, + 14875, + 10300, + 315, + 458, + 40608, + 3691, + 41189, + 40119, + 389, + 9104, + 291, + 22360, + 36148, + ] + ], + "wo_sampler": [ + [ + 1986, + 374, + 264, + 3265, + 5239, + 11, + 1909, + 14875, + 10300, + 315, + 264, + 3908, + 3691, + 41189, + 11, + 4363, + 264, + 79276, + 10392, + 461, + ] + ], + } for i in range(full_batch_size): assert ( tokenizer.decode(model_w_sampler_exec_info.generated_ids[i][:generation_len]) == golden_texts["w_sampler"]