quic · qcdipankar · Apr 20, 2026
diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -388,12 +388,6 @@ def forward(
         key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
 
-        # cos, sin = position_embeddings
-        # kv_seq_len = key_states.shape[-2]
-        # kv_seq_len = past_key_value.get_seq_length()
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-
         query_states, key_states = qeff_apply_rotary_pos_emb(
             query_states,
             key_states,
@@ -409,7 +403,6 @@ def forward(
                 "cos": cos_cached,
                 "batch_index": batch_index,
                 "position_ids": position_ids[0],
-                "past_seen_tokens": past_seen_tokens,
             }
             if comp_ctx_lengths is not None:
                 attention_mask = attention_mask[:, :, :, : comp_ctx_lengths.shape[-1]]
@@ -503,7 +496,7 @@ def forward(
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states[0]
+        hidden_states = residual + hidden_states
 
         outputs = (hidden_states,)
 
@@ -667,7 +660,7 @@ class QEffQwen3VLDecoderWrapper(nn.Module):
     def __init__(self, model):
         super().__init__()
         self.model = model
-        self.language_model = self.model.model
+        self.language_model = self.model.model.language_model
 
     def get_submodules_for_export(self) -> Type[nn.Module]:
         """

diff --git a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py
@@ -45,7 +45,7 @@
         aic_enable_depth_first=True,
         skip_vision=True,
         mos=1,
-        use_onnx_subfunctions=True,
+        use_onnx_subfunctions=False,
     )
 
     messages = [
@@ -84,8 +84,6 @@
         num_devices=4,
         height=354,
         width=536,
-        # height=1024,
-        # width=1024,
         mxfp6_matmul=True,
         mxint8_kv_cache=True,
         aic_enable_depth_first=True,
@@ -95,10 +93,6 @@
 
     ### IMAGE + TEXT ###
     image_url = "https://picsum.photos/id/237/536/354"
-    # image_url = (
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
-    # )
-
     image = Image.open(requests.get(image_url, stream=True).raw)
 
     messages_1 = [
@@ -111,16 +105,6 @@
         },
     ]
 
-    # messages_2 = [
-    #     {
-    #         "role": "user",
-    #         "content": [
-    #             {"type": "image", "image": image},
-    #             {"type": "text", "text": "Describe about the color of the dog."},
-    #         ],
-    #     },
-    # ]
-
     messages = [messages_1] * batch_size
 
     texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]