From 08bb022d83f4fb76b0fb481140d4563ab5b0db06 Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Sat, 18 Apr 2026 10:55:00 +0000
Subject: [PATCH 01/11] Enable support for the 2B and 8B reranker models of
 the qwen3vl bucket

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 .../transformers/models/modeling_auto.py      |   8 +-
 .../models/qwen3vl/reranker/README.md         |  52 ++
 .../qwen3vl/reranker/qwen3_vl_reranker.py     | 555 ++++++++++++++++++
 tests/configs/image_text_model_configs.json   |   2 +-
 .../image_text_to_text/test_reranker_mad.py   | 455 ++++++++++++++
 5 files changed, 1069 insertions(+), 3 deletions(-)
 create mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/README.md
 create mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py
 create mode 100644 tests/transformers/models/image_text_to_text/test_reranker_mad.py

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 0b1e3702b6..fc10032df6 100755
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1377,7 +1377,7 @@ def export(
                 kv_offload=True,
                 continuous_batching=self.continuous_batching,
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
-                **dummy_inputs_kwargs,
+                prefill_seq_len=prefill_seq_len,
             )
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True,
@@ -1385,7 +1385,11 @@ def export(
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
             )
         except TypeError:
-            inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode)
+            inputs = self.model.get_dummy_inputs(
+                kv_offload=True,
+                comp_ctx_lengths=self.comp_ctx_lengths_decode,
+                prefill_seq_len=prefill_seq_len,
+            )
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode
             )
diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/README.md b/examples/image_text_to_text/models/qwen3vl/reranker/README.md
new file mode 100644
index 0000000000..a3e715478d
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen3vl/reranker/README.md
@@ -0,0 +1,52 @@
+# Qwen3-VL Reranker Inference
+
+This directory contains an AI100 example for running Qwen3-VL reranker models with QEfficient and printing per-document relevance scores.
+
+Supported models:
+- `Qwen/Qwen3-VL-Reranker-2B`
+- `Qwen/Qwen3-VL-Reranker-8B`
+
+## What this example does
+
+- Loads a Qwen3-VL reranker model from Hugging Face (or a local snapshot path).
+- Uses QEff dual-QPC execution (vision encoder + language model).
+- Runs the same query against multiple text/image documents.
+- Prints one score per document in input order.
+
+## Required package
+
+- `qwen-vl-utils>=0.0.14`
+
+```bash
+pip install "qwen-vl-utils>=0.0.14"
+```
+
+## Script
+
+- `qwen3_vl_reranker.py`
+
+## Run
+
+```bash
+python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
+  --model-name Qwen/Qwen3-VL-Reranker-2B
+```
+
+Or run with 8B:
+
+```bash
+python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
+  --model-name Qwen/Qwen3-VL-Reranker-8B
+```
+
+With compile parameters:
+
+```bash
+python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
+  --model-name Qwen/Qwen3-VL-Reranker-2B \
+  --ctx-len 2048 \
+  --num-cores 16 \
+  --num-devices 1 \
+  --compile-prefill-seq-len 4096 \
+  --mxfp6-matmul
+```
diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py
new file mode 100644
index 0000000000..2fdd225571
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py
@@ -0,0 +1,555 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import argparse
+import os
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from huggingface_hub import snapshot_download
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor
+
+from QEfficient import QEFFAutoModelForImageTextToText
+from QEfficient.generation.cloud_infer import QAICInferenceSession
+
+DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
+DEFAULT_CTX_LEN = 2048
+DEFAULT_NUM_CORES = 16
+DEFAULT_NUM_DEVICES = 1
+
+# Max token budget used by this example's manual truncation/padding flow.
+MAX_LENGTH = 8192
+# Pixel constraints used by Qwen3-VL preprocessing.
+IMAGE_BASE_FACTOR = 16
+IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
+MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR
+MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR
+FPS = 1.0
+
+
+class QEffQwen3VLReranker:
+    @staticmethod
+    def _resolve_model_source(model_name_or_path: str) -> str:
+        """Return a local model path when given an HF repo id.
+
+        Why:
+        Some transformers versions can fail when resolving chat templates in
+        repo-id mode for this model. Loading from a local snapshot avoids that issue.
+        """
+        if os.path.isdir(model_name_or_path):
+            return model_name_or_path
+        return snapshot_download(repo_id=model_name_or_path)
+
+    def __init__(
+        self,
+        model_name_or_path: str = DEFAULT_MODEL_NAME,
+        ctx_len: int = DEFAULT_CTX_LEN,
+        num_cores: int = DEFAULT_NUM_CORES,
+        num_devices: int = DEFAULT_NUM_DEVICES,
+        mxfp6_matmul: bool = False,
+        compile_prefill_seq_len: int = None,
+    ):
+        """Initialize the AI100-only reranker wrapper.
+
+        This loads:
+        - HF config/processor for prompt and multimodal preprocessing.
+        - QEFF dual-QPC model wrapper (vision encoder + language decoder).
+        - Token ids for "yes"/"no" used to compute reranker scores.
+
+        Parameters
+        ----------
+        model_name_or_path:
+            HF model id or local snapshot path.
+        """
+        self.model_name_or_path = model_name_or_path
+        self.model_source = self._resolve_model_source(model_name_or_path)
+        self.ctx_len = ctx_len
+        self.num_cores = num_cores
+        self.num_devices = num_devices
+        self.mxfp6_matmul = mxfp6_matmul
+        self.compile_prefill_seq_len = compile_prefill_seq_len
+        self.max_length = MAX_LENGTH
+        self.fps = FPS
+
+        # Use local snapshot for stable processor/chat-template loading.
+        config = AutoConfig.from_pretrained(self.model_source, trust_remote_code=True, padding=True)
+        if hasattr(config, "use_cache"):
+            config.use_cache = True
+        if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
+            config.text_config.use_cache = True
+
+        self.processor = AutoProcessor.from_pretrained(self.model_source, trust_remote_code=True, padding=True)
+        self.model = QEFFAutoModelForImageTextToText.from_pretrained(
+            self.model_source,
+            kv_offload=True,
+            trust_remote_code=True,
+            config=config,
+        )
+
+        self.yes_token_id, self.no_token_id = self._get_yes_no_token_ids(self.processor.tokenizer)
+        self._compiled_qpc_paths = None
+        self._compiled_prefill_seq_len = 0
+        self._compiled_height = None
+        self._compiled_width = None
+
+    @staticmethod
+    def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]:
+        """Resolve tokenizer ids for the exact tokens 'yes' and 'no'."""
+        vocab = tokenizer.get_vocab()
+        if "yes" not in vocab or "no" not in vocab:
+            raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.")
+        return vocab["yes"], vocab["no"]
+
+    @staticmethod
+    def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float:
+        """Convert model logits into a reranker relevance score.
+
+        Score formula:
+            sigmoid(logit_yes - logit_no)
+        """
+        # Convert runtime output to torch and use final-token logits.
+        logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu()
+        if logits_tensor.ndim == 3:
+            logits_tensor = logits_tensor[:, -1, :]
+        # Binary relevance score from yes/no logit gap.
+        score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
+        return float(score[0].item())
+
+    @staticmethod
+    def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
+        """Truncate while preserving all special tokens in sequence order."""
+        if len(tokens) <= max_length:
+            return tokens
+
+        # Preserve all special/control tokens and trim only non-special tokens.
+        special_tokens_set = set(special_tokens)
+        num_special = sum(1 for token in tokens if token in special_tokens_set)
+        num_non_special_to_keep = max_length - num_special
+
+        final_tokens = []
+        non_special_kept_count = 0
+        for token in tokens:
+            if token in special_tokens_set:
+                final_tokens.append(token)
+            elif non_special_kept_count < num_non_special_to_keep:
+                final_tokens.append(token)
+                non_special_kept_count += 1
+        return final_tokens
+
+    def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]:
+        """Build one multimodal content block (prefix + optional image + optional text)."""
+        # Prefix helps the model distinguish query vs document sections.
+        content = [{"type": "text", "text": prefix}]
+
+        if not text and not image and not video:
+            content.append({"type": "text", "text": "NULL"})
+            return content
+
+        if video:
+            raise ValueError("Video input is not supported in this AI100-only example.")
+
+        if image:
+            # Convert local paths to file:// URIs for the processor.
+            if isinstance(image, str):
+                image_content = image if image.startswith(("http", "oss")) else "file://" + image
+            else:
+                image_content = image
+            content.append(
+                {
+                    "type": "image",
+                    "image": image_content,
+                    "min_pixels": MIN_PIXELS,
+                    "max_pixels": MAX_PIXELS,
+                }
+            )
+
+        if text:
+            content.append({"type": "text", "text": text})
+
+        return content
+
+    def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]:
+        """Create the chat payload for one query-document pair."""
+        # Prompt shape follows the HF reranker reference format.
+        contents = [{"type": "text", "text": "<Instruct>: " + instruction}]
+
+        contents.extend(
+            self._format_mm_content(
+                query.get("text"),
+                query.get("image"),
+                query.get("video"),
+                prefix="<Query>:",
+            )
+        )
+        contents.extend(
+            self._format_mm_content(
+                document.get("text"),
+                document.get("image"),
+                document.get("video"),
+                prefix="\n<Document>:",
+            )
+        )
+
+        return [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": (
+                            "Judge whether the Document meets the requirements based on the Query and the Instruct "
+                            'provided. Note that the answer can only be "yes" or "no".'
+                        ),
+                    }
+                ],
+            },
+            {"role": "user", "content": contents},
+        ]
+
+    def _tokenize_pair(self, pair: List[Dict]) -> Dict:
+        """Tokenize a query-document pair with the exact HF multimodal pipeline."""
+        # Processor expects list-of-conversations.
+        pairs = [pair]
+        text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)
+
+        # Build image/video tensors + metadata for processor inputs.
+        images, videos, video_kwargs = process_vision_info(
+            pairs,
+            image_patch_size=16,
+            return_video_kwargs=True,
+            return_video_metadata=True,
+        )
+
+        if videos is not None:
+            videos, video_metadatas = zip(*videos)
+            videos = list(videos)
+            video_metadatas = list(video_metadatas)
+        else:
+            video_metadatas = None
+
+        inputs = self.processor(
+            text=text,
+            images=images,
+            videos=videos,
+            video_metadata=video_metadatas,
+            truncation=False,
+            padding=False,
+            do_resize=False,
+            **video_kwargs,
+        )
+
+        # Apply custom truncation preserving trailing template control tokens.
+        for i, input_ids in enumerate(inputs["input_ids"]):
+            inputs["input_ids"][i] = (
+                self._truncate_tokens_optimized(
+                    input_ids[:-5],
+                    self.max_length,
+                    self.processor.tokenizer.all_special_ids,
+                )
+                + input_ids[-5:]
+            )
+
+        # Re-pad through tokenizer utilities so masks align with token ids.
+        padded = self.processor.tokenizer.pad(
+            {"input_ids": inputs["input_ids"]},
+            padding=True,
+            return_tensors="pt",
+            max_length=self.max_length,
+        )
+        for key in padded:
+            inputs[key] = padded[key]
+
+        if "pixel_values" in inputs:
+            # Keep pixels fp32 before explicit cast to fp16 during vision run.
+            inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+
+        return inputs
+
+    def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int = None):
+        """Prepare model inputs for dual-QPC prefill execution."""
+        # True prompt length before compile-aligned padding.
+        runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1])
+        effective_prefill = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len
+        if effective_prefill < runtime_prompt_len:
+            raise ValueError(
+                f"prefill_seq_len ({effective_prefill}) must be >= runtime prompt length ({runtime_prompt_len})."
+            )
+
+        # Let model helper compute position_ids and multimodal placement.
+        prepared_inputs = self.model.model.prepare_inputs_for_generation(
+            inputs=tokenized_inputs,
+            prefill_seq_len=effective_prefill,
+            batch_size=1,
+        )
+
+        # Normalize image_grid_thw to the shape consumed by compiled path.
+        if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2:
+            thw = prepared_inputs["image_grid_thw"][0]
+            t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item())
+            prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype)
+
+        if "pixel_values" in prepared_inputs:
+            prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32)
+
+        return prepared_inputs, runtime_prompt_len
+
+    def _ensure_compiled(self, prefill_seq_len: int, height: int, width: int):
+        """Compile QPCs if needed, otherwise reuse cached compiled artifacts."""
+        # Reuse previously compiled artifacts whenever shapes are compatible.
+        if (
+            self._compiled_qpc_paths is not None
+            and prefill_seq_len <= self._compiled_prefill_seq_len
+            and height == self._compiled_height
+            and width == self._compiled_width
+        ):
+            return
+
+        reuse_vision_qpc = (
+            self._compiled_qpc_paths is not None and height == self._compiled_height and width == self._compiled_width
+        )
+
+        # Compile one max prefill specialization and optionally skip vision recompile.
+        compiled_paths = self.model.compile(
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=self.ctx_len,
+            img_size=max(height, width),
+            height=height,
+            width=width,
+            num_cores=self.num_cores,
+            num_devices=self.num_devices,
+            mxfp6_matmul=self.mxfp6_matmul,
+            # vision_embed_fp32=True,
+            skip_vision=reuse_vision_qpc,
+        )
+        if reuse_vision_qpc:
+            compiled_paths["vision_qpc_path"] = self._compiled_qpc_paths["vision_qpc_path"]
+
+        self._compiled_qpc_paths = compiled_paths
+        self._compiled_prefill_seq_len = prefill_seq_len
+        self._compiled_height = height
+        self._compiled_width = width
+
+    @staticmethod
+    def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+        """Create zero-valued placeholders matching vision output buffers."""
+        return {name: np.zeros_like(value) for name, value in vision_outputs.items()}
+
+    def _run_ai100_vision(self, prepared_inputs) -> Dict[str, np.ndarray]:
+        """Run the compiled vision encoder QPC and return retained-state buffers."""
+        if "pixel_values" not in prepared_inputs or "image_grid_thw" not in prepared_inputs:
+            raise ValueError("Missing pixel_values/image_grid_thw for vision execution.")
+
+        # Vision session produces retained states consumed by language session.
+        vision_session = QAICInferenceSession(self._compiled_qpc_paths["vision_qpc_path"])
+        vision_outputs = vision_session.run(
+            {
+                # Vision qpc expects fp16 pixels + int64 grid coordinates.
+                "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16),
+                "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64),
+            }
+        )
+        vision_session.deactivate()
+        return vision_outputs
+
+    def _run_ai100_prefill(self, prepared_inputs, vision_template: Dict[str, np.ndarray]) -> np.ndarray:
+        """Run one prefill pass on AI100 language QPC and return logits."""
+        # Match runtime input to compiled prefill length.
+        prefill_len = prepared_inputs["position_ids"].shape[-1]
+        input_ids = prepared_inputs["input_ids"]
+        if input_ids.shape[1] < prefill_len:
+            pad = torch.full(
+                (input_ids.shape[0], prefill_len - input_ids.shape[1]),
+                1,
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            input_ids = torch.cat([input_ids, pad], dim=1)
+        else:
+            input_ids = input_ids[:, :prefill_len]
+
+        position_ids = prepared_inputs["position_ids"][..., :prefill_len]
+
+        # For text-only docs, inject zeroed retained states with matching shapes.
+        if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
+            vision_outputs = self._run_ai100_vision(prepared_inputs)
+        else:
+            vision_outputs = self._zero_vision_outputs(vision_template)
+
+        # Skip past/retained buffers and run only required prefill inputs.
+        lang_session = QAICInferenceSession(self._compiled_qpc_paths["lang_qpc_path"])
+        lang_session.skip_buffers(
+            [
+                name
+                for name in lang_session.input_names + lang_session.output_names
+                if name.startswith("past_") or name.endswith("_RetainedState")
+            ]
+        )
+        lang_session.set_buffers(vision_outputs)
+        outputs = lang_session.run(
+            {
+                # image_idx selects the vision buffer slot for this request.
+                "input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
+                "position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
+                "image_idx": np.zeros((1, 1), dtype=np.int64),
+            }
+        )
+        lang_session.deactivate()
+        return outputs["logits"]
+
+    def process(self, inputs: Dict) -> List[float]:
+        """Score all documents for one query on AI100.
+
+        High-level flow:
+        1) Build model-ready query-document pairs.
+        2) Find max prompt/image shape across all docs.
+        3) Compile once at max shape (single stable specialization).
+        4) Run prefill per doc and convert logits -> score.
+        """
+        # Unpack user payload.
+        instruction = inputs["instruction"]
+        query = inputs.get("query", {})
+        documents = inputs.get("documents", [])
+
+        # Collect per-document tokenized contexts first so we can compile once
+        # with the largest prompt/image shape required by this request.
+        prepared_contexts = []
+        max_prompt_len = 0
+        max_grid_h = 22
+        max_grid_w = 34
+
+        # Build each pair in the exact chat-template format expected by the model.
+        for document in documents:
+            pair = self._format_mm_instruction(instruction, query, document)
+            tokenized = self._tokenize_pair(pair)
+            runtime_prompt_len = int(tokenized["input_ids"].shape[1])
+
+            # Track the max image grid (H, W) seen so compile dimensions can
+            # handle all documents in this batch.
+            if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0:
+                grid = tokenized["image_grid_thw"]
+                max_grid_h = max(max_grid_h, int(grid[..., 1].max().item()))
+                max_grid_w = max(max_grid_w, int(grid[..., 2].max().item()))
+
+            prepared_contexts.append(
+                {
+                    "tokenized": tokenized,
+                    "runtime_prompt_len": runtime_prompt_len,
+                }
+            )
+            max_prompt_len = max(max_prompt_len, runtime_prompt_len)
+
+        # Empty documents list => no scores.
+        if max_prompt_len == 0:
+            return []
+
+        # Convert max grid to compile-time pixel dimensions using model patch size.
+        patch_size = int(self.model.model.config.vision_config.patch_size)
+        compile_height = max_grid_h * patch_size
+        compile_width = max_grid_w * patch_size
+
+        # Compile/reuse a single language specialization and prepare all requests
+        # to that same prefill length to avoid per-document recompiles.
+        target_prefill_seq_len = max_prompt_len
+        if self.compile_prefill_seq_len is not None:
+            if self.compile_prefill_seq_len < max_prompt_len:
+                raise ValueError(
+                    f"--compile-prefill-seq-len ({self.compile_prefill_seq_len}) must be >= "
+                    f"max runtime prompt length ({max_prompt_len})."
+                )
+            target_prefill_seq_len = self.compile_prefill_seq_len
+
+        self._ensure_compiled(target_prefill_seq_len, compile_height, compile_width)
+
+        # Prepare all documents to the same prefill length used at compile time.
+        prepared_contexts_with_prefill = []
+        vision_template = None
+        for ctx in prepared_contexts:
+            prepared_inputs, _ = self._prepare_inputs(ctx["tokenized"], prefill_seq_len=target_prefill_seq_len)
+            prepared_contexts_with_prefill.append({"prepared_inputs": prepared_inputs})
+
+            # Capture one real vision-output template so text-only docs can reuse
+            # zero-valued buffers with exact matching shapes.
+            if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
+                vision_template = self._run_ai100_vision(prepared_inputs)
+
+        # This example currently expects at least one image document to establish
+        # retained-state buffer shapes for mixed image/text batches.
+        if vision_template is None:
+            raise ValueError("At least one image document is required to initialize AI100 vision buffers.")
+
+        # Run language prefill and compute scalar score per document.
+        scores = []
+        for ctx in prepared_contexts_with_prefill:
+            logits = self._run_ai100_prefill(
+                ctx["prepared_inputs"],
+                vision_template=vision_template,
+            )
+            # Reranker score = sigmoid(logit_yes - logit_no).
+            score = self._score_from_logits(logits, self.yes_token_id, self.no_token_id)
+            scores.append(score)
+
+        return scores
+
+
+def main():
+    # Keep CLI simple: model id/path override plus a few compile-time options.
+    parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.")
+    parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME)
+    parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.")
+    parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.")
+    parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.")
+    parser.add_argument(
+        "--mxfp6-matmul",
+        action="store_true",
+        help="Enable MXFP6 matmul during compile (default: disabled).",
+    )
+    parser.add_argument(
+        "--compile-prefill-seq-len",
+        type=int,
+        default=None,
+        help=(
+            "Optional fixed prefill sequence length for compile/padding. "
+            "Must be >= max prompt length of the current request."
+        ),
+    )
+    args = parser.parse_args()
+
+    model = QEffQwen3VLReranker(
+        model_name_or_path=args.model_name,
+        ctx_len=args.ctx_len,
+        num_cores=args.num_cores,
+        num_devices=args.num_devices,
+        mxfp6_matmul=args.mxfp6_matmul,
+        compile_prefill_seq_len=args.compile_prefill_seq_len,
+    )
+
+    # Example input payload matching the HF reranker schema.
+    inputs = {
+        "instruction": "Retrieve images or text relevant to the user's query.",
+        "query": {"text": "A woman playing with her dog on a beach at sunset."},
+        "documents": [
+            {
+                "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."
+            },
+            {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
+            {
+                "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.",
+                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+            },
+        ],
+        "fps": 1.0,
+    }
+
+    # Print one score per document in the same order as inputs["documents"].
+    scores = model.process(inputs)
+    print(scores)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json
index 85df559970..f4cdb6a0fd 100644
--- a/tests/configs/image_text_model_configs.json
+++ b/tests/configs/image_text_model_configs.json
@@ -5,7 +5,7 @@
       "model_type": "llava",
       "batch_size": 1,
       "prompt_len": 784,
-      "ctx_len": 1024,
+      "ctx_len": 2048,
       "img_size": 336,
       "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
       "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
diff --git a/tests/transformers/models/image_text_to_text/test_reranker_mad.py b/tests/transformers/models/image_text_to_text/test_reranker_mad.py
new file mode 100644
index 0000000000..3a6497b520
--- /dev/null
+++ b/tests/transformers/models/image_text_to_text/test_reranker_mad.py
@@ -0,0 +1,455 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+import json
+import os
+from typing import Dict, List, Tuple
+
+import numpy as np
+import pytest
+import torch
+from huggingface_hub import snapshot_download
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor
+
+from QEfficient.generation.cloud_infer import QAICInferenceSession
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText
+from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm
+
+CONFIG_PATH = "tests/configs/image_text_model_configs.json"
+
+PT_AI100_MAD_MAX = 5e-3
+MAX_LENGTH = 8192
+RERANKER_DOC_LIMIT = int(os.getenv("QEFF_RERANKER_DOC_LIMIT", "0"))
+
+IMAGE_BASE_FACTOR = 16
+IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
+MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR
+MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR
+
+EXAMPLE_INPUTS = {
+    "instruction": "Retrieve relevant content.",
+    "query": {"text": "dog on beach"},
+    "documents": [
+        {"image": "https://picsum.photos/id/237/536/354"},
+        {"text": "A dog running on the beach."},
+    ],
+}
+
+with open(CONFIG_PATH, "r") as f:
+    config_data = json.load(f)
+    reranker_models = config_data["image_text_reranker_models"]
+
+test_reranker_models = [model_config["model_name"] for model_config in reranker_models]
+reranker_model_config_dict = {model["model_name"]: model for model in reranker_models}
+
+
+def _resolve_model_source(model_name_or_path: str) -> str:
+    if os.path.isdir(model_name_or_path):
+        return model_name_or_path
+    return snapshot_download(repo_id=model_name_or_path)
+
+
+def _format_mm_content(text, image, video, prefix: str) -> List[Dict]:
+    content = [{"type": "text", "text": prefix}]
+
+    if not text and not image and not video:
+        content.append({"type": "text", "text": "NULL"})
+        return content
+
+    if video:
+        raise ValueError("Video input is not supported in this test.")
+
+    if image:
+        if isinstance(image, str):
+            image_content = image if image.startswith(("http", "oss")) else "file://" + image
+        else:
+            image_content = image
+        content.append(
+            {
+                "type": "image",
+                "image": image_content,
+                "min_pixels": MIN_PIXELS,
+                "max_pixels": MAX_PIXELS,
+            }
+        )
+
+    if text:
+        content.append({"type": "text", "text": text})
+
+    return content
+
+
+def _format_mm_instruction(instruction: str, query: Dict, document: Dict) -> List[Dict]:
+    contents = [{"type": "text", "text": "<Instruct>: " + instruction}]
+
+    contents.extend(
+        _format_mm_content(
+            query.get("text"),
+            query.get("image"),
+            query.get("video"),
+            prefix="<Query>:",
+        )
+    )
+    contents.extend(
+        _format_mm_content(
+            document.get("text"),
+            document.get("image"),
+            document.get("video"),
+            prefix="\n<Document>:",
+        )
+    )
+
+    return [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": (
+                        "Judge whether the Document meets the requirements based on the Query and the Instruct "
+                        'provided. Note that the answer can only be "yes" or "no".'
+                    ),
+                }
+            ],
+        },
+        {"role": "user", "content": contents},
+    ]
+
+
+def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
+    if len(tokens) <= max_length:
+        return tokens
+
+    special_tokens_set = set(special_tokens)
+    num_special = sum(1 for token in tokens if token in special_tokens_set)
+    num_non_special_to_keep = max_length - num_special
+
+    final_tokens = []
+    non_special_kept_count = 0
+    for token in tokens:
+        if token in special_tokens_set:
+            final_tokens.append(token)
+        elif non_special_kept_count < num_non_special_to_keep:
+            final_tokens.append(token)
+            non_special_kept_count += 1
+    return final_tokens
+
+
+def _tokenize_pair(processor, pair: List[Dict]) -> Dict:
+    pairs = [pair]
+    text = processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)
+
+    images, videos, video_kwargs = process_vision_info(
+        pairs,
+        image_patch_size=16,
+        return_video_kwargs=True,
+        return_video_metadata=True,
+    )
+
+    if videos is not None:
+        videos, video_metadatas = zip(*videos)
+        videos = list(videos)
+        video_metadatas = list(video_metadatas)
+    else:
+        video_metadatas = None
+
+    inputs = processor(
+        text=text,
+        images=images,
+        videos=videos,
+        video_metadata=video_metadatas,
+        truncation=False,
+        padding=False,
+        do_resize=False,
+        **video_kwargs,
+    )
+
+    for i, input_ids in enumerate(inputs["input_ids"]):
+        inputs["input_ids"][i] = (
+            _truncate_tokens_optimized(
+                input_ids[:-5],
+                MAX_LENGTH,
+                processor.tokenizer.all_special_ids,
+            )
+            + input_ids[-5:]
+        )
+
+    padded = processor.tokenizer.pad(
+        {"input_ids": inputs["input_ids"]},
+        padding=True,
+        return_tensors="pt",
+        max_length=MAX_LENGTH,
+    )
+    for key in padded:
+        inputs[key] = padded[key]
+
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+
+    return inputs
+
+
+def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]:
+    vocab = tokenizer.get_vocab()
+    if "yes" not in vocab or "no" not in vocab:
+        raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.")
+    return vocab["yes"], vocab["no"]
+
+
+def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> np.ndarray:
+    if isinstance(logits, np.ndarray):
+        logits_tensor = torch.from_numpy(logits)
+    else:
+        logits_tensor = logits.detach().cpu()
+
+    if logits_tensor.ndim == 3:
+        logits_tensor = logits_tensor[:, -1, :]
+    elif logits_tensor.ndim != 2:
+        raise ValueError(f"Unsupported logits rank for score conversion: {logits_tensor.ndim}")
+
+    score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
+    return score.detach().cpu().numpy().astype(np.float64)
+
+
+def _score_from_last_hidden(last_hidden_state: torch.Tensor, score_linear: torch.nn.Linear) -> np.ndarray:
+    score = torch.sigmoid(score_linear(last_hidden_state[:, -1])).squeeze(-1)
+    return score.detach().cpu().numpy().astype(np.float64)
+
+
+def _make_score_linear(model_hf, yes_token_id: int, no_token_id: int) -> torch.nn.Linear:
+    lm_head_weights = model_hf.lm_head.weight.data
+    weight_yes = lm_head_weights[yes_token_id]
+    weight_no = lm_head_weights[no_token_id]
+
+    linear_layer = torch.nn.Linear(weight_yes.shape[0], 1, bias=False)
+    with torch.no_grad():
+        linear_layer.weight[0] = weight_yes - weight_no
+    return linear_layer.eval()
+
+
+def _mad_stats(reference: np.ndarray, candidate: np.ndarray) -> Tuple[float, float]:
+    diff = np.abs(reference - candidate)
+    return float(np.mean(diff)), float(np.max(diff))
+
+
+def _prepare_qeff_inputs(qeff_model, tokenized_inputs: Dict, prefill_seq_len: int = None):
+    runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1])
+    effective_prefill_seq_len = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len
+    if effective_prefill_seq_len < runtime_prompt_len:
+        raise ValueError(
+            f"prefill_seq_len ({effective_prefill_seq_len}) must be >= runtime prompt length ({runtime_prompt_len})."
+        )
+
+    prepared_inputs = qeff_model.model.prepare_inputs_for_generation(
+        inputs=tokenized_inputs,
+        prefill_seq_len=effective_prefill_seq_len,
+        batch_size=1,
+    )
+
+    if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2:
+        thw = prepared_inputs["image_grid_thw"][0]
+        t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item())
+        prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype)
+
+    if "pixel_values" in prepared_inputs:
+        prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32)
+
+    return prepared_inputs, runtime_prompt_len
+
+
+def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+    return {name: np.zeros_like(value) for name, value in vision_outputs.items()}
+
+
+def _run_ai100_vision(vision_qpc_path: str, prepared_inputs) -> Dict[str, np.ndarray]:
+    vision_session = QAICInferenceSession(vision_qpc_path)
+    vision_inputs = {
+        "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16),
+        "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64),
+    }
+    vision_outputs = vision_session.run(vision_inputs)
+    vision_session.deactivate()
+    return vision_outputs
+
+
+def _run_ai100_prefill(qpc_paths, prepared_inputs, vision_template):
+    if not isinstance(qpc_paths, dict):
+        raise ValueError("Expected qpc_paths to be a dict with vision/lang QPC keys.")
+
+    vision_qpc_path = qpc_paths.get("vision_qpc_path")
+    lang_qpc_path = qpc_paths.get("lang_qpc_path")
+    if vision_qpc_path is None or lang_qpc_path is None:
+        raise ValueError("Missing vision_qpc_path/lang_qpc_path in compiled QPC outputs.")
+
+    prefill_len = prepared_inputs["position_ids"].shape[-1]
+    input_ids = prepared_inputs["input_ids"]
+    if input_ids.shape[1] < prefill_len:
+        pad = torch.full(
+            (input_ids.shape[0], prefill_len - input_ids.shape[1]),
+            1,
+            dtype=input_ids.dtype,
+            device=input_ids.device,
+        )
+        input_ids = torch.cat([input_ids, pad], dim=1)
+    else:
+        input_ids = input_ids[:, :prefill_len]
+    position_ids = prepared_inputs["position_ids"][..., :prefill_len]
+
+    if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
+        vision_outputs = _run_ai100_vision(vision_qpc_path, prepared_inputs)
+    else:
+        vision_outputs = _zero_vision_outputs(vision_template)
+
+    lang_session = QAICInferenceSession(lang_qpc_path)
+    lang_session.skip_buffers(
+        [
+            name
+            for name in lang_session.input_names + lang_session.output_names
+            if name.startswith("past_") or name.endswith("_RetainedState")
+        ]
+    )
+    lang_session.set_buffers(vision_outputs)
+    lang_inputs = {
+        "input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
+        "position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
+        "image_idx": np.zeros((1, 1), dtype=np.int64),
+    }
+    outputs = lang_session.run(lang_inputs)
+    lang_session.deactivate()
+    return outputs["logits"]
+
+
+@pytest.mark.on_qaic
+@pytest.mark.multimodal
+@pytest.mark.regular
+@pytest.mark.parametrize("model_name", test_reranker_models)
+def test_qwen3_vl_reranker_mad_parity(model_name):
+    torch.manual_seed(42)
+    model_cfg = reranker_model_config_dict[model_name]
+    model_source = _resolve_model_source(model_name)
+
+    config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True)
+    config = set_num_layers_vlm(config, n_layer=model_cfg["num_layers"])
+    if hasattr(config, "use_cache"):
+        config.use_cache = True
+    if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
+        config.text_config.use_cache = True
+
+    model_hf = load_vlm_model(config)
+    model_hf.eval()
+
+    qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+        model_source,
+        kv_offload=True,
+        config=config,
+    )
+    processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True)
+
+    yes_token_id, no_token_id = _get_yes_no_token_ids(processor.tokenizer)
+    score_linear = _make_score_linear(model_hf, yes_token_id, no_token_id).to(next(model_hf.parameters()).device)
+    score_linear = score_linear.to(dtype=next(model_hf.parameters()).dtype)
+
+    doc_contexts = []
+    max_prompt_len = 0
+    max_grid_h = 22
+    max_grid_w = 34
+
+    hf_scores_list = []
+
+    documents = EXAMPLE_INPUTS["documents"]
+    if RERANKER_DOC_LIMIT > 0:
+        documents = documents[:RERANKER_DOC_LIMIT]
+
+    for document in documents:
+        pair = _format_mm_instruction(
+            instruction=EXAMPLE_INPUTS["instruction"],
+            query=EXAMPLE_INPUTS["query"],
+            document=document,
+        )
+        tokenized = _tokenize_pair(processor, pair)
+        runtime_prompt_len = int(tokenized["input_ids"].shape[1])
+
+        hf_inputs = {}
+        for key, value in tokenized.items():
+            hf_inputs[key] = value.to(next(model_hf.parameters()).device) if torch.is_tensor(value) else value
+        with torch.no_grad():
+            hf_last_hidden = model_hf.model(**hf_inputs).last_hidden_state
+        hf_score = _score_from_last_hidden(hf_last_hidden, score_linear)[0]
+        hf_scores_list.append(float(hf_score))
+
+        if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0:
+            grid = tokenized["image_grid_thw"]
+            max_grid_h = max(max_grid_h, int(grid[..., 1].max().item()))
+            max_grid_w = max(max_grid_w, int(grid[..., 2].max().item()))
+
+        doc_contexts.append(
+            {
+                "tokenized": tokenized,
+            }
+        )
+        max_prompt_len = max(max_prompt_len, runtime_prompt_len)
+
+    patch_size = int(qeff_model.model.config.vision_config.patch_size)
+    compile_height = max_grid_h * patch_size
+    compile_width = max_grid_w * patch_size
+
+    qpc_paths = qeff_model.compile(
+        img_size=max(compile_height, compile_width),
+        height=compile_height,
+        width=compile_width,
+        prefill_seq_len=max_prompt_len,
+        ctx_len=model_cfg["ctx_len"],
+        num_devices=1,
+        num_cores=16,
+        mxfp6_matmul=False,
+    )
+
+    ai100_scores_list = []
+
+    prepared_contexts = []
+    vision_template_ai100 = None
+    for context in doc_contexts:
+        prepared_inputs, _ = _prepare_qeff_inputs(
+            qeff_model=qeff_model,
+            tokenized_inputs=context["tokenized"],
+            prefill_seq_len=max_prompt_len,
+        )
+        prepared_contexts.append(
+            {
+                "prepared_inputs": prepared_inputs,
+            }
+        )
+        if vision_template_ai100 is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
+            vision_template_ai100 = _run_ai100_vision(qpc_paths["vision_qpc_path"], prepared_inputs)
+
+    if vision_template_ai100 is None:
+        raise ValueError("Expected at least one image document to initialize vision templates.")
+
+    for context in prepared_contexts:
+        prepared_inputs_runtime = context["prepared_inputs"]
+        ai100_logits = _run_ai100_prefill(
+            qpc_paths=qpc_paths,
+            prepared_inputs=prepared_inputs_runtime,
+            vision_template=vision_template_ai100,
+        )
+        ai100_score = _score_from_logits(ai100_logits, yes_token_id, no_token_id)[0]
+        ai100_scores_list.append(float(ai100_score))
+
+    hf_scores = np.array(hf_scores_list, dtype=np.float64)
+    ai100_scores = np.array(ai100_scores_list, dtype=np.float64)
+
+    print(f"[SCORES] PyTorch(original): {hf_scores.tolist()}")
+    print(f"[SCORES] AI100: {ai100_scores.tolist()}")
+
+    pt_ai100_mad_mean, pt_ai100_mad_max = _mad_stats(hf_scores, ai100_scores)
+    print(f"[MAD] PyTorch(original) vs AI100: mean={pt_ai100_mad_mean:.6e}, max={pt_ai100_mad_max:.6e}")
+    assert pt_ai100_mad_max <= PT_AI100_MAD_MAX, (
+        f"PyTorch(original) vs AI100 MAD max {pt_ai100_mad_max:.6e} "
+        f"exceeds threshold {PT_AI100_MAD_MAX:.6e}. "
+        f"Check tokenizer ids, prompt formatting, runtime prompt length slicing, and compile dimensions."
+    )

From 711fd8100adde2b5acb2e1837908c2b86a6f08cf Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Tue, 19 May 2026 08:50:39 +0530
Subject: [PATCH 02/11] Functionality changes to PR and rebase with main branch

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 .../models/qwen3vl/reranker/README.md         |  52 --
 .../qwen3vl/reranker/qwen3_vl_reranker.py     | 555 ------------------
 tests/configs/image_text_model_configs.json   |   2 +-
 .../image_text_to_text/test_reranker_mad.py   | 455 --------------
 4 files changed, 1 insertion(+), 1063 deletions(-)
 delete mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/README.md
 delete mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py
 delete mode 100644 tests/transformers/models/image_text_to_text/test_reranker_mad.py

diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/README.md b/examples/image_text_to_text/models/qwen3vl/reranker/README.md
deleted file mode 100644
index a3e715478d..0000000000
--- a/examples/image_text_to_text/models/qwen3vl/reranker/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# Qwen3-VL Reranker Inference
-
-This directory contains an AI100 example for running Qwen3-VL reranker models with QEfficient and printing per-document relevance scores.
-
-Supported models:
-- `Qwen/Qwen3-VL-Reranker-2B`
-- `Qwen/Qwen3-VL-Reranker-8B`
-
-## What this example does
-
-- Loads Qwen3-VL reranker from Hugging Face (or local snapshot path).
-- Uses QEff dual-QPC execution (vision encoder + language model).
-- Runs the same query against multiple text/image documents.
-- Prints one score per document in input order.
-
-## Required package
-
-- `qwen-vl-utils>=0.0.14`
-
-```bash
-pip install "qwen-vl-utils>=0.0.14"
-```
-
-## Script
-
-- `qwen3_vl_reranker.py`
-
-## Run
-
-```bash
-python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
-  --model-name Qwen/Qwen3-VL-Reranker-2B
-```
-
-Or run with 8B:
-
-```bash
-python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
-  --model-name Qwen/Qwen3-VL-Reranker-8B
-```
-
-With compile parameters:
-
-```bash
-python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \
-  --model-name Qwen/Qwen3-VL-Reranker-2B \
-  --ctx-len 2048 \
-  --num-cores 16 \
-  --num-devices 1 \
-  --compile-prefill-seq-len 4096 \
-  --mxfp6-matmul
-```
diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py
deleted file mode 100644
index 2fdd225571..0000000000
--- a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-
-import argparse
-import os
-from typing import Dict, List, Tuple
-
-import numpy as np
-import torch
-from huggingface_hub import snapshot_download
-from qwen_vl_utils import process_vision_info
-from transformers import AutoConfig, AutoProcessor
-
-from QEfficient import QEFFAutoModelForImageTextToText
-from QEfficient.generation.cloud_infer import QAICInferenceSession
-
-DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
-DEFAULT_CTX_LEN = 2048
-DEFAULT_NUM_CORES = 16
-DEFAULT_NUM_DEVICES = 1
-
-# Max token budget used by this example's manual truncation/padding flow.
-MAX_LENGTH = 8192
-# Pixel constraints used by Qwen3-VL preprocessing.
-IMAGE_BASE_FACTOR = 16
-IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
-MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR
-MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR
-FPS = 1.0
-
-
-class QEffQwen3VLReranker:
-    @staticmethod
-    def _resolve_model_source(model_name_or_path: str) -> str:
-        """Return a local model path when given an HF repo id.
-
-        Why:
-        Some transformers versions can fail when resolving chat templates from
-        repo-id mode for this model. Using a local snapshot path avoids that path.
-        """
-        if os.path.isdir(model_name_or_path):
-            return model_name_or_path
-        return snapshot_download(repo_id=model_name_or_path)
-
-    def __init__(
-        self,
-        model_name_or_path: str = DEFAULT_MODEL_NAME,
-        ctx_len: int = DEFAULT_CTX_LEN,
-        num_cores: int = DEFAULT_NUM_CORES,
-        num_devices: int = DEFAULT_NUM_DEVICES,
-        mxfp6_matmul: bool = False,
-        compile_prefill_seq_len: int = None,
-    ):
-        """Initialize the AI100-only reranker wrapper.
-
-        This loads:
-        - HF config/processor for prompt and multimodal preprocessing.
-        - QEFF dual-QPC model wrapper (vision encoder + language decoder).
-        - Token ids for "yes"/"no" used to compute reranker scores.
-
-        Parameters
-        ----------
-        model_name_or_path:
-            HF model id or local snapshot path.
-        """
-        self.model_name_or_path = model_name_or_path
-        self.model_source = self._resolve_model_source(model_name_or_path)
-        self.ctx_len = ctx_len
-        self.num_cores = num_cores
-        self.num_devices = num_devices
-        self.mxfp6_matmul = mxfp6_matmul
-        self.compile_prefill_seq_len = compile_prefill_seq_len
-        self.max_length = MAX_LENGTH
-        self.fps = FPS
-
-        # Use local snapshot for stable processor/chat-template loading.
-        config = AutoConfig.from_pretrained(self.model_source, trust_remote_code=True, padding=True)
-        if hasattr(config, "use_cache"):
-            config.use_cache = True
-        if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
-            config.text_config.use_cache = True
-
-        self.processor = AutoProcessor.from_pretrained(self.model_source, trust_remote_code=True, padding=True)
-        self.model = QEFFAutoModelForImageTextToText.from_pretrained(
-            self.model_source,
-            kv_offload=True,
-            trust_remote_code=True,
-            config=config,
-        )
-
-        self.yes_token_id, self.no_token_id = self._get_yes_no_token_ids(self.processor.tokenizer)
-        self._compiled_qpc_paths = None
-        self._compiled_prefill_seq_len = 0
-        self._compiled_height = None
-        self._compiled_width = None
-
-    @staticmethod
-    def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]:
-        """Resolve tokenizer ids for the exact tokens 'yes' and 'no'."""
-        vocab = tokenizer.get_vocab()
-        if "yes" not in vocab or "no" not in vocab:
-            raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.")
-        return vocab["yes"], vocab["no"]
-
-    @staticmethod
-    def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float:
-        """Convert model logits into a reranker relevance score.
-
-        Score formula:
-            sigmoid(logit_yes - logit_no)
-        """
-        # Convert runtime output to torch and use final-token logits.
-        logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu()
-        if logits_tensor.ndim == 3:
-            logits_tensor = logits_tensor[:, -1, :]
-        # Binary relevance score from yes/no logit gap.
-        score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
-        return float(score[0].item())
-
-    @staticmethod
-    def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
-        """Truncate while preserving all special tokens in sequence order."""
-        if len(tokens) <= max_length:
-            return tokens
-
-        # Preserve all special/control tokens and trim only non-special tokens.
-        special_tokens_set = set(special_tokens)
-        num_special = sum(1 for token in tokens if token in special_tokens_set)
-        num_non_special_to_keep = max_length - num_special
-
-        final_tokens = []
-        non_special_kept_count = 0
-        for token in tokens:
-            if token in special_tokens_set:
-                final_tokens.append(token)
-            elif non_special_kept_count < num_non_special_to_keep:
-                final_tokens.append(token)
-                non_special_kept_count += 1
-        return final_tokens
-
-    def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]:
-        """Build one multimodal content block (prefix + optional image + optional text)."""
-        # Prefix helps the model distinguish query vs document sections.
-        content = [{"type": "text", "text": prefix}]
-
-        if not text and not image and not video:
-            content.append({"type": "text", "text": "NULL"})
-            return content
-
-        if video:
-            raise ValueError("Video input is not supported in this AI100-only example.")
-
-        if image:
-            # Convert local paths to file:// URIs for the processor.
-            if isinstance(image, str):
-                image_content = image if image.startswith(("http", "oss")) else "file://" + image
-            else:
-                image_content = image
-            content.append(
-                {
-                    "type": "image",
-                    "image": image_content,
-                    "min_pixels": MIN_PIXELS,
-                    "max_pixels": MAX_PIXELS,
-                }
-            )
-
-        if text:
-            content.append({"type": "text", "text": text})
-
-        return content
-
-    def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]:
-        """Create the chat payload for one query-document pair."""
-        # Prompt shape follows the HF reranker reference format.
-        contents = [{"type": "text", "text": "<Instruct>: " + instruction}]
-
-        contents.extend(
-            self._format_mm_content(
-                query.get("text"),
-                query.get("image"),
-                query.get("video"),
-                prefix="<Query>:",
-            )
-        )
-        contents.extend(
-            self._format_mm_content(
-                document.get("text"),
-                document.get("image"),
-                document.get("video"),
-                prefix="\n<Document>:",
-            )
-        )
-
-        return [
-            {
-                "role": "system",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": (
-                            "Judge whether the Document meets the requirements based on the Query and the Instruct "
-                            'provided. Note that the answer can only be "yes" or "no".'
-                        ),
-                    }
-                ],
-            },
-            {"role": "user", "content": contents},
-        ]
-
-    def _tokenize_pair(self, pair: List[Dict]) -> Dict:
-        """Tokenize a query-document pair with the exact HF multimodal pipeline."""
-        # Processor expects list-of-conversations.
-        pairs = [pair]
-        text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)
-
-        # Build image/video tensors + metadata for processor inputs.
-        images, videos, video_kwargs = process_vision_info(
-            pairs,
-            image_patch_size=16,
-            return_video_kwargs=True,
-            return_video_metadata=True,
-        )
-
-        if videos is not None:
-            videos, video_metadatas = zip(*videos)
-            videos = list(videos)
-            video_metadatas = list(video_metadatas)
-        else:
-            video_metadatas = None
-
-        inputs = self.processor(
-            text=text,
-            images=images,
-            videos=videos,
-            video_metadata=video_metadatas,
-            truncation=False,
-            padding=False,
-            do_resize=False,
-            **video_kwargs,
-        )
-
-        # Apply custom truncation preserving trailing template control tokens.
-        for i, input_ids in enumerate(inputs["input_ids"]):
-            inputs["input_ids"][i] = (
-                self._truncate_tokens_optimized(
-                    input_ids[:-5],
-                    self.max_length,
-                    self.processor.tokenizer.all_special_ids,
-                )
-                + input_ids[-5:]
-            )
-
-        # Re-pad through tokenizer utilities so masks align with token ids.
-        padded = self.processor.tokenizer.pad(
-            {"input_ids": inputs["input_ids"]},
-            padding=True,
-            return_tensors="pt",
-            max_length=self.max_length,
-        )
-        for key in padded:
-            inputs[key] = padded[key]
-
-        if "pixel_values" in inputs:
-            # Keep pixels fp32 before explicit cast to fp16 during vision run.
-            inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
-
-        return inputs
-
-    def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int = None):
-        """Prepare model inputs for dual-QPC prefill execution."""
-        # True prompt length before compile-aligned padding.
-        runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1])
-        effective_prefill = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len
-        if effective_prefill < runtime_prompt_len:
-            raise ValueError(
-                f"prefill_seq_len ({effective_prefill}) must be >= runtime prompt length ({runtime_prompt_len})."
-            )
-
-        # Let model helper compute position_ids and multimodal placement.
-        prepared_inputs = self.model.model.prepare_inputs_for_generation(
-            inputs=tokenized_inputs,
-            prefill_seq_len=effective_prefill,
-            batch_size=1,
-        )
-
-        # Normalize image_grid_thw to the shape consumed by compiled path.
-        if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2:
-            thw = prepared_inputs["image_grid_thw"][0]
-            t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item())
-            prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype)
-
-        if "pixel_values" in prepared_inputs:
-            prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32)
-
-        return prepared_inputs, runtime_prompt_len
-
-    def _ensure_compiled(self, prefill_seq_len: int, height: int, width: int):
-        """Compile QPCs if needed, otherwise reuse cached compiled artifacts."""
-        # Reuse previously compiled artifacts whenever shapes are compatible.
-        if (
-            self._compiled_qpc_paths is not None
-            and prefill_seq_len <= self._compiled_prefill_seq_len
-            and height == self._compiled_height
-            and width == self._compiled_width
-        ):
-            return
-
-        reuse_vision_qpc = (
-            self._compiled_qpc_paths is not None and height == self._compiled_height and width == self._compiled_width
-        )
-
-        # Compile one max prefill specialization and optionally skip vision recompile.
-        compiled_paths = self.model.compile(
-            prefill_seq_len=prefill_seq_len,
-            ctx_len=self.ctx_len,
-            img_size=max(height, width),
-            height=height,
-            width=width,
-            num_cores=self.num_cores,
-            num_devices=self.num_devices,
-            mxfp6_matmul=self.mxfp6_matmul,
-            # vision_embed_fp32=True,
-            skip_vision=reuse_vision_qpc,
-        )
-        if reuse_vision_qpc:
-            compiled_paths["vision_qpc_path"] = self._compiled_qpc_paths["vision_qpc_path"]
-
-        self._compiled_qpc_paths = compiled_paths
-        self._compiled_prefill_seq_len = prefill_seq_len
-        self._compiled_height = height
-        self._compiled_width = width
-
-    @staticmethod
-    def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
-        """Create zero-valued placeholders matching vision output buffers."""
-        return {name: np.zeros_like(value) for name, value in vision_outputs.items()}
-
-    def _run_ai100_vision(self, prepared_inputs) -> Dict[str, np.ndarray]:
-        """Run the compiled vision encoder QPC and return retained-state buffers."""
-        if "pixel_values" not in prepared_inputs or "image_grid_thw" not in prepared_inputs:
-            raise ValueError("Missing pixel_values/image_grid_thw for vision execution.")
-
-        # Vision session produces retained states consumed by language session.
-        vision_session = QAICInferenceSession(self._compiled_qpc_paths["vision_qpc_path"])
-        vision_outputs = vision_session.run(
-            {
-                # Vision qpc expects fp16 pixels + int64 grid coordinates.
-                "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16),
-                "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64),
-            }
-        )
-        vision_session.deactivate()
-        return vision_outputs
-
-    def _run_ai100_prefill(self, prepared_inputs, vision_template: Dict[str, np.ndarray]) -> np.ndarray:
-        """Run one prefill pass on AI100 language QPC and return logits."""
-        # Match runtime input to compiled prefill length.
-        prefill_len = prepared_inputs["position_ids"].shape[-1]
-        input_ids = prepared_inputs["input_ids"]
-        if input_ids.shape[1] < prefill_len:
-            pad = torch.full(
-                (input_ids.shape[0], prefill_len - input_ids.shape[1]),
-                1,
-                dtype=input_ids.dtype,
-                device=input_ids.device,
-            )
-            input_ids = torch.cat([input_ids, pad], dim=1)
-        else:
-            input_ids = input_ids[:, :prefill_len]
-
-        position_ids = prepared_inputs["position_ids"][..., :prefill_len]
-
-        # For text-only docs, inject zeroed retained states with matching shapes.
-        if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
-            vision_outputs = self._run_ai100_vision(prepared_inputs)
-        else:
-            vision_outputs = self._zero_vision_outputs(vision_template)
-
-        # Skip past/retained buffers and run only required prefill inputs.
-        lang_session = QAICInferenceSession(self._compiled_qpc_paths["lang_qpc_path"])
-        lang_session.skip_buffers(
-            [
-                name
-                for name in lang_session.input_names + lang_session.output_names
-                if name.startswith("past_") or name.endswith("_RetainedState")
-            ]
-        )
-        lang_session.set_buffers(vision_outputs)
-        outputs = lang_session.run(
-            {
-                # image_idx selects the vision buffer slot for this request.
-                "input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
-                "position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
-                "image_idx": np.zeros((1, 1), dtype=np.int64),
-            }
-        )
-        lang_session.deactivate()
-        return outputs["logits"]
-
-    def process(self, inputs: Dict) -> List[float]:
-        """Score all documents for one query on AI100.
-
-        High-level flow:
-        1) Build model-ready query-document pairs.
-        2) Find max prompt/image shape across all docs.
-        3) Compile once at max shape (single stable specialization).
-        4) Run prefill per doc and convert logits -> score.
-        """
-        # Unpack user payload.
-        instruction = inputs["instruction"]
-        query = inputs.get("query", {})
-        documents = inputs.get("documents", [])
-
-        # Collect per-document tokenized contexts first so we can compile once
-        # with the largest prompt/image shape required by this request.
-        prepared_contexts = []
-        max_prompt_len = 0
-        max_grid_h = 22
-        max_grid_w = 34
-
-        # Build each pair in the exact chat-template format expected by the model.
-        for document in documents:
-            pair = self._format_mm_instruction(instruction, query, document)
-            tokenized = self._tokenize_pair(pair)
-            runtime_prompt_len = int(tokenized["input_ids"].shape[1])
-
-            # Track the max image grid (H, W) seen so compile dimensions can
-            # handle all documents in this batch.
-            if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0:
-                grid = tokenized["image_grid_thw"]
-                max_grid_h = max(max_grid_h, int(grid[..., 1].max().item()))
-                max_grid_w = max(max_grid_w, int(grid[..., 2].max().item()))
-
-            prepared_contexts.append(
-                {
-                    "tokenized": tokenized,
-                    "runtime_prompt_len": runtime_prompt_len,
-                }
-            )
-            max_prompt_len = max(max_prompt_len, runtime_prompt_len)
-
-        # Empty documents list => no scores.
-        if max_prompt_len == 0:
-            return []
-
-        # Convert max grid to compile-time pixel dimensions using model patch size.
-        patch_size = int(self.model.model.config.vision_config.patch_size)
-        compile_height = max_grid_h * patch_size
-        compile_width = max_grid_w * patch_size
-
-        # Compile/reuse a single language specialization and prepare all requests
-        # to that same prefill length to avoid per-document recompiles.
-        target_prefill_seq_len = max_prompt_len
-        if self.compile_prefill_seq_len is not None:
-            if self.compile_prefill_seq_len < max_prompt_len:
-                raise ValueError(
-                    f"--compile-prefill-seq-len ({self.compile_prefill_seq_len}) must be >= "
-                    f"max runtime prompt length ({max_prompt_len})."
-                )
-            target_prefill_seq_len = self.compile_prefill_seq_len
-
-        self._ensure_compiled(target_prefill_seq_len, compile_height, compile_width)
-
-        # Prepare all documents to the same prefill length used at compile time.
-        prepared_contexts_with_prefill = []
-        vision_template = None
-        for ctx in prepared_contexts:
-            prepared_inputs, _ = self._prepare_inputs(ctx["tokenized"], prefill_seq_len=target_prefill_seq_len)
-            prepared_contexts_with_prefill.append({"prepared_inputs": prepared_inputs})
-
-            # Capture one real vision-output template so text-only docs can reuse
-            # zero-valued buffers with exact matching shapes.
-            if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
-                vision_template = self._run_ai100_vision(prepared_inputs)
-
-        # This example currently expects at least one image document to establish
-        # retained-state buffer shapes for mixed image/text batches.
-        if vision_template is None:
-            raise ValueError("At least one image document is required to initialize AI100 vision buffers.")
-
-        # Run language prefill and compute scalar score per document.
-        scores = []
-        for ctx in prepared_contexts_with_prefill:
-            logits = self._run_ai100_prefill(
-                ctx["prepared_inputs"],
-                vision_template=vision_template,
-            )
-            # Reranker score = sigmoid(logit_yes - logit_no).
-            score = self._score_from_logits(logits, self.yes_token_id, self.no_token_id)
-            scores.append(score)
-
-        return scores
-
-
-def main():
-    # Keep CLI simple: just allow model id/path override.
-    parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.")
-    parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME)
-    parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.")
-    parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.")
-    parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.")
-    parser.add_argument(
-        "--mxfp6-matmul",
-        action="store_true",
-        help="Enable MXFP6 matmul during compile (default: disabled).",
-    )
-    parser.add_argument(
-        "--compile-prefill-seq-len",
-        type=int,
-        default=None,
-        help=(
-            "Optional fixed prefill sequence length for compile/padding. "
-            "Must be >= max prompt length of the current request."
-        ),
-    )
-    args = parser.parse_args()
-
-    model = QEffQwen3VLReranker(
-        model_name_or_path=args.model_name,
-        ctx_len=args.ctx_len,
-        num_cores=args.num_cores,
-        num_devices=args.num_devices,
-        mxfp6_matmul=args.mxfp6_matmul,
-        compile_prefill_seq_len=args.compile_prefill_seq_len,
-    )
-
-    # Example input payload matching the HF reranker schema.
-    inputs = {
-        "instruction": "Retrieve images or text relevant to the user's query.",
-        "query": {"text": "A woman playing with her dog on a beach at sunset."},
-        "documents": [
-            {
-                "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."
-            },
-            {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
-            {
-                "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.",
-                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
-            },
-        ],
-        "fps": 1.0,
-    }
-
-    # Print one score per document in the same order as inputs["documents"].
-    scores = model.process(inputs)
-    print(scores)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json
index f4cdb6a0fd..85df559970 100644
--- a/tests/configs/image_text_model_configs.json
+++ b/tests/configs/image_text_model_configs.json
@@ -5,7 +5,7 @@
       "model_type": "llava",
       "batch_size": 1,
       "prompt_len": 784,
-      "ctx_len": 2048,
+      "ctx_len": 1024,
       "img_size": 336,
       "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
       "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
diff --git a/tests/transformers/models/image_text_to_text/test_reranker_mad.py b/tests/transformers/models/image_text_to_text/test_reranker_mad.py
deleted file mode 100644
index 3a6497b520..0000000000
--- a/tests/transformers/models/image_text_to_text/test_reranker_mad.py
+++ /dev/null
@@ -1,455 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# ----------------------------------------------------------------------------
-
-import json
-import os
-from typing import Dict, List, Tuple
-
-import numpy as np
-import pytest
-import torch
-from huggingface_hub import snapshot_download
-from qwen_vl_utils import process_vision_info
-from transformers import AutoConfig, AutoProcessor
-
-from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText
-from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm
-
-CONFIG_PATH = "tests/configs/image_text_model_configs.json"
-
-PT_AI100_MAD_MAX = 5e-3
-MAX_LENGTH = 8192
-RERANKER_DOC_LIMIT = int(os.getenv("QEFF_RERANKER_DOC_LIMIT", "0"))
-
-IMAGE_BASE_FACTOR = 16
-IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
-MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR
-MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR
-
-EXAMPLE_INPUTS = {
-    "instruction": "Retrieve relevant content.",
-    "query": {"text": "dog on beach"},
-    "documents": [
-        {"image": "https://picsum.photos/id/237/536/354"},
-        {"text": "A dog running on the beach."},
-    ],
-}
-
-with open(CONFIG_PATH, "r") as f:
-    config_data = json.load(f)
-    reranker_models = config_data["image_text_reranker_models"]
-
-test_reranker_models = [model_config["model_name"] for model_config in reranker_models]
-reranker_model_config_dict = {model["model_name"]: model for model in reranker_models}
-
-
-def _resolve_model_source(model_name_or_path: str) -> str:
-    if os.path.isdir(model_name_or_path):
-        return model_name_or_path
-    return snapshot_download(repo_id=model_name_or_path)
-
-
-def _format_mm_content(text, image, video, prefix: str) -> List[Dict]:
-    content = [{"type": "text", "text": prefix}]
-
-    if not text and not image and not video:
-        content.append({"type": "text", "text": "NULL"})
-        return content
-
-    if video:
-        raise ValueError("Video input is not supported in this test.")
-
-    if image:
-        if isinstance(image, str):
-            image_content = image if image.startswith(("http", "oss")) else "file://" + image
-        else:
-            image_content = image
-        content.append(
-            {
-                "type": "image",
-                "image": image_content,
-                "min_pixels": MIN_PIXELS,
-                "max_pixels": MAX_PIXELS,
-            }
-        )
-
-    if text:
-        content.append({"type": "text", "text": text})
-
-    return content
-
-
-def _format_mm_instruction(instruction: str, query: Dict, document: Dict) -> List[Dict]:
-    contents = [{"type": "text", "text": "<Instruct>: " + instruction}]
-
-    contents.extend(
-        _format_mm_content(
-            query.get("text"),
-            query.get("image"),
-            query.get("video"),
-            prefix="<Query>:",
-        )
-    )
-    contents.extend(
-        _format_mm_content(
-            document.get("text"),
-            document.get("image"),
-            document.get("video"),
-            prefix="\n<Document>:",
-        )
-    )
-
-    return [
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "text",
-                    "text": (
-                        "Judge whether the Document meets the requirements based on the Query and the Instruct "
-                        'provided. Note that the answer can only be "yes" or "no".'
-                    ),
-                }
-            ],
-        },
-        {"role": "user", "content": contents},
-    ]
-
-
-def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
-    if len(tokens) <= max_length:
-        return tokens
-
-    special_tokens_set = set(special_tokens)
-    num_special = sum(1 for token in tokens if token in special_tokens_set)
-    num_non_special_to_keep = max_length - num_special
-
-    final_tokens = []
-    non_special_kept_count = 0
-    for token in tokens:
-        if token in special_tokens_set:
-            final_tokens.append(token)
-        elif non_special_kept_count < num_non_special_to_keep:
-            final_tokens.append(token)
-            non_special_kept_count += 1
-    return final_tokens
-
-
-def _tokenize_pair(processor, pair: List[Dict]) -> Dict:
-    pairs = [pair]
-    text = processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)
-
-    images, videos, video_kwargs = process_vision_info(
-        pairs,
-        image_patch_size=16,
-        return_video_kwargs=True,
-        return_video_metadata=True,
-    )
-
-    if videos is not None:
-        videos, video_metadatas = zip(*videos)
-        videos = list(videos)
-        video_metadatas = list(video_metadatas)
-    else:
-        video_metadatas = None
-
-    inputs = processor(
-        text=text,
-        images=images,
-        videos=videos,
-        video_metadata=video_metadatas,
-        truncation=False,
-        padding=False,
-        do_resize=False,
-        **video_kwargs,
-    )
-
-    for i, input_ids in enumerate(inputs["input_ids"]):
-        inputs["input_ids"][i] = (
-            _truncate_tokens_optimized(
-                input_ids[:-5],
-                MAX_LENGTH,
-                processor.tokenizer.all_special_ids,
-            )
-            + input_ids[-5:]
-        )
-
-    padded = processor.tokenizer.pad(
-        {"input_ids": inputs["input_ids"]},
-        padding=True,
-        return_tensors="pt",
-        max_length=MAX_LENGTH,
-    )
-    for key in padded:
-        inputs[key] = padded[key]
-
-    if "pixel_values" in inputs:
-        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
-
-    return inputs
-
-
-def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]:
-    vocab = tokenizer.get_vocab()
-    if "yes" not in vocab or "no" not in vocab:
-        raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.")
-    return vocab["yes"], vocab["no"]
-
-
-def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> np.ndarray:
-    if isinstance(logits, np.ndarray):
-        logits_tensor = torch.from_numpy(logits)
-    else:
-        logits_tensor = logits.detach().cpu()
-
-    if logits_tensor.ndim == 3:
-        logits_tensor = logits_tensor[:, -1, :]
-    elif logits_tensor.ndim != 2:
-        raise ValueError(f"Unsupported logits rank for score conversion: {logits_tensor.ndim}")
-
-    score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
-    return score.detach().cpu().numpy().astype(np.float64)
-
-
-def _score_from_last_hidden(last_hidden_state: torch.Tensor, score_linear: torch.nn.Linear) -> np.ndarray:
-    score = torch.sigmoid(score_linear(last_hidden_state[:, -1])).squeeze(-1)
-    return score.detach().cpu().numpy().astype(np.float64)
-
-
-def _make_score_linear(model_hf, yes_token_id: int, no_token_id: int) -> torch.nn.Linear:
-    lm_head_weights = model_hf.lm_head.weight.data
-    weight_yes = lm_head_weights[yes_token_id]
-    weight_no = lm_head_weights[no_token_id]
-
-    linear_layer = torch.nn.Linear(weight_yes.shape[0], 1, bias=False)
-    with torch.no_grad():
-        linear_layer.weight[0] = weight_yes - weight_no
-    return linear_layer.eval()
-
-
-def _mad_stats(reference: np.ndarray, candidate: np.ndarray) -> Tuple[float, float]:
-    diff = np.abs(reference - candidate)
-    return float(np.mean(diff)), float(np.max(diff))
-
-
-def _prepare_qeff_inputs(qeff_model, tokenized_inputs: Dict, prefill_seq_len: int = None):
-    runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1])
-    effective_prefill_seq_len = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len
-    if effective_prefill_seq_len < runtime_prompt_len:
-        raise ValueError(
-            f"prefill_seq_len ({effective_prefill_seq_len}) must be >= runtime prompt length ({runtime_prompt_len})."
-        )
-
-    prepared_inputs = qeff_model.model.prepare_inputs_for_generation(
-        inputs=tokenized_inputs,
-        prefill_seq_len=effective_prefill_seq_len,
-        batch_size=1,
-    )
-
-    if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2:
-        thw = prepared_inputs["image_grid_thw"][0]
-        t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item())
-        prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype)
-
-    if "pixel_values" in prepared_inputs:
-        prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32)
-
-    return prepared_inputs, runtime_prompt_len
-
-
-def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
-    return {name: np.zeros_like(value) for name, value in vision_outputs.items()}
-
-
-def _run_ai100_vision(vision_qpc_path: str, prepared_inputs) -> Dict[str, np.ndarray]:
-    vision_session = QAICInferenceSession(vision_qpc_path)
-    vision_inputs = {
-        "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16),
-        "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64),
-    }
-    vision_outputs = vision_session.run(vision_inputs)
-    vision_session.deactivate()
-    return vision_outputs
-
-
-def _run_ai100_prefill(qpc_paths, prepared_inputs, vision_template):
-    if not isinstance(qpc_paths, dict):
-        raise ValueError("Expected qpc_paths to be a dict with vision/lang QPC keys.")
-
-    vision_qpc_path = qpc_paths.get("vision_qpc_path")
-    lang_qpc_path = qpc_paths.get("lang_qpc_path")
-    if vision_qpc_path is None or lang_qpc_path is None:
-        raise ValueError("Missing vision_qpc_path/lang_qpc_path in compiled QPC outputs.")
-
-    prefill_len = prepared_inputs["position_ids"].shape[-1]
-    input_ids = prepared_inputs["input_ids"]
-    if input_ids.shape[1] < prefill_len:
-        pad = torch.full(
-            (input_ids.shape[0], prefill_len - input_ids.shape[1]),
-            1,
-            dtype=input_ids.dtype,
-            device=input_ids.device,
-        )
-        input_ids = torch.cat([input_ids, pad], dim=1)
-    else:
-        input_ids = input_ids[:, :prefill_len]
-    position_ids = prepared_inputs["position_ids"][..., :prefill_len]
-
-    if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
-        vision_outputs = _run_ai100_vision(vision_qpc_path, prepared_inputs)
-    else:
-        vision_outputs = _zero_vision_outputs(vision_template)
-
-    lang_session = QAICInferenceSession(lang_qpc_path)
-    lang_session.skip_buffers(
-        [
-            name
-            for name in lang_session.input_names + lang_session.output_names
-            if name.startswith("past_") or name.endswith("_RetainedState")
-        ]
-    )
-    lang_session.set_buffers(vision_outputs)
-    lang_inputs = {
-        "input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
-        "position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
-        "image_idx": np.zeros((1, 1), dtype=np.int64),
-    }
-    outputs = lang_session.run(lang_inputs)
-    lang_session.deactivate()
-    return outputs["logits"]
-
-
-@pytest.mark.on_qaic
-@pytest.mark.multimodal
-@pytest.mark.regular
-@pytest.mark.parametrize("model_name", test_reranker_models)
-def test_qwen3_vl_reranker_mad_parity(model_name):
-    torch.manual_seed(42)
-    model_cfg = reranker_model_config_dict[model_name]
-    model_source = _resolve_model_source(model_name)
-
-    config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True)
-    config = set_num_layers_vlm(config, n_layer=model_cfg["num_layers"])
-    if hasattr(config, "use_cache"):
-        config.use_cache = True
-    if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
-        config.text_config.use_cache = True
-
-    model_hf = load_vlm_model(config)
-    model_hf.eval()
-
-    qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
-        model_source,
-        kv_offload=True,
-        config=config,
-    )
-    processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True)
-
-    yes_token_id, no_token_id = _get_yes_no_token_ids(processor.tokenizer)
-    score_linear = _make_score_linear(model_hf, yes_token_id, no_token_id).to(next(model_hf.parameters()).device)
-    score_linear = score_linear.to(dtype=next(model_hf.parameters()).dtype)
-
-    doc_contexts = []
-    max_prompt_len = 0
-    max_grid_h = 22
-    max_grid_w = 34
-
-    hf_scores_list = []
-
-    documents = EXAMPLE_INPUTS["documents"]
-    if RERANKER_DOC_LIMIT > 0:
-        documents = documents[:RERANKER_DOC_LIMIT]
-
-    for document in documents:
-        pair = _format_mm_instruction(
-            instruction=EXAMPLE_INPUTS["instruction"],
-            query=EXAMPLE_INPUTS["query"],
-            document=document,
-        )
-        tokenized = _tokenize_pair(processor, pair)
-        runtime_prompt_len = int(tokenized["input_ids"].shape[1])
-
-        hf_inputs = {}
-        for key, value in tokenized.items():
-            hf_inputs[key] = value.to(next(model_hf.parameters()).device) if torch.is_tensor(value) else value
-        with torch.no_grad():
-            hf_last_hidden = model_hf.model(**hf_inputs).last_hidden_state
-        hf_score = _score_from_last_hidden(hf_last_hidden, score_linear)[0]
-        hf_scores_list.append(float(hf_score))
-
-        if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0:
-            grid = tokenized["image_grid_thw"]
-            max_grid_h = max(max_grid_h, int(grid[..., 1].max().item()))
-            max_grid_w = max(max_grid_w, int(grid[..., 2].max().item()))
-
-        doc_contexts.append(
-            {
-                "tokenized": tokenized,
-            }
-        )
-        max_prompt_len = max(max_prompt_len, runtime_prompt_len)
-
-    patch_size = int(qeff_model.model.config.vision_config.patch_size)
-    compile_height = max_grid_h * patch_size
-    compile_width = max_grid_w * patch_size
-
-    qpc_paths = qeff_model.compile(
-        img_size=max(compile_height, compile_width),
-        height=compile_height,
-        width=compile_width,
-        prefill_seq_len=max_prompt_len,
-        ctx_len=model_cfg["ctx_len"],
-        num_devices=1,
-        num_cores=16,
-        mxfp6_matmul=False,
-    )
-
-    ai100_scores_list = []
-
-    prepared_contexts = []
-    vision_template_ai100 = None
-    for context in doc_contexts:
-        prepared_inputs, _ = _prepare_qeff_inputs(
-            qeff_model=qeff_model,
-            tokenized_inputs=context["tokenized"],
-            prefill_seq_len=max_prompt_len,
-        )
-        prepared_contexts.append(
-            {
-                "prepared_inputs": prepared_inputs,
-            }
-        )
-        if vision_template_ai100 is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
-            vision_template_ai100 = _run_ai100_vision(qpc_paths["vision_qpc_path"], prepared_inputs)
-
-    if vision_template_ai100 is None:
-        raise ValueError("Expected at least one image document to initialize vision templates.")
-
-    for context in prepared_contexts:
-        prepared_inputs_runtime = context["prepared_inputs"]
-        ai100_logits = _run_ai100_prefill(
-            qpc_paths=qpc_paths,
-            prepared_inputs=prepared_inputs_runtime,
-            vision_template=vision_template_ai100,
-        )
-        ai100_score = _score_from_logits(ai100_logits, yes_token_id, no_token_id)[0]
-        ai100_scores_list.append(float(ai100_score))
-
-    hf_scores = np.array(hf_scores_list, dtype=np.float64)
-    ai100_scores = np.array(ai100_scores_list, dtype=np.float64)
-
-    print(f"[SCORES] PyTorch(original): {hf_scores.tolist()}")
-    print(f"[SCORES] AI100: {ai100_scores.tolist()}")
-
-    pt_ai100_mad_mean, pt_ai100_mad_max = _mad_stats(hf_scores, ai100_scores)
-    print(f"[MAD] PyTorch(original) vs AI100: mean={pt_ai100_mad_mean:.6e}, max={pt_ai100_mad_max:.6e}")
-    assert pt_ai100_mad_max <= PT_AI100_MAD_MAX, (
-        f"PyTorch(original) vs AI100 MAD max {pt_ai100_mad_max:.6e} "
-        f"exceeds threshold {PT_AI100_MAD_MAX:.6e}. "
-        f"Check tokenizer ids, prompt formatting, runtime prompt length slicing, and compile dimensions."
-    )

From 612ed3e0c3aad26d60d58ae0eea1386816ffa5c0 Mon Sep 17 00:00:00 2001
From: Amit <amitraj@qti.qualcomm.com>
Date: Wed, 20 May 2026 23:32:03 +0530
Subject: [PATCH 03/11] Addressed comments and fix CI issue

Signed-off-by: Amit <amitraj@qti.qualcomm.com>
Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 examples/reranker/qwen3vl/README.md         |  7 ++-----
 examples/reranker/qwen3vl/reranker_model.py | 10 +++-------
 scripts/Jenkinsfile                         |  2 +-
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md
index d9d96645a8..7ebe1d7db8 100644
--- a/examples/reranker/qwen3vl/README.md
+++ b/examples/reranker/qwen3vl/README.md
@@ -23,11 +23,8 @@ pip install "qwen-vl-utils>=0.0.14"
 
 ## Scripts
 
-- `qwen3_vl_reranker.py` - runnable example that explicitly shows:
-  - `QEFFAutoModelForImageTextToText.from_pretrained(...)`
-  - `model.compile(...)` arguments for QPC generation
-  - AI100 scoring call flow
-- `reranker_model.py` - Qwen3-VL-specific helper logic (prompting/tokenization/scoring/runtime glue) adapted from the official Qwen reranker reference:
+- `qwen3_vl_reranker.py` - simple runnable API usage example.
+- `reranker_model.py` - AI100 dual-QPC implementation adapted from the official Qwen reranker reference:
   https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py
 
 ## Run
diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py
index 33e73b05f6..8577c8a979 100644
--- a/examples/reranker/qwen3vl/reranker_model.py
+++ b/examples/reranker/qwen3vl/reranker_model.py
@@ -5,17 +5,13 @@
 #
 # ----------------------------------------------------------------------------
 
-"""Qwen3-VL-specific reranker helpers for AI100 runtime.
+"""Core AI100 reranker implementation for Qwen3-VL reranker models.
 
 The tokenization/scoring flow is adapted from the official Qwen reference:
 https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py
 
-This module intentionally keeps only Qwen3-VL-specific reranker logic
-(prompt construction, multimodal tokenization, yes/no score computation,
-and AI100 runtime orchestration with compiled QPC paths).
-
-Model loading (`from_pretrained`) and model compilation (`compile`) are exposed
-in `qwen3_vl_reranker.py` so users can directly see QEff API usage.
+This module isolates AI100 dual-QPC runtime details so the user-facing example
+script (`qwen3_vl_reranker.py`) remains focused on simple API usage.
 """
 
 from typing import Dict, List, Tuple
diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index 49f637c2f9..0858a08254 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -64,7 +64,7 @@ pipeline {
                     pip install .[test] &&
                     pip install junitparser pytest-xdist &&
                     pip install librosa==0.10.2 soundfile==0.13.1 &&
-                    pip install qwen-vl-utils==0.0.14 &&
+                    pip install "qwen-vl-utils>=0.0.14" &&
                     pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1
                     rm -rf QEfficient"
                 '''

From c4334c18e92f205355c4f9a40256f8d1c32680ec Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Thu, 21 May 2026 11:00:53 +0530
Subject: [PATCH 04/11] Updated installation of qwen-vl-utils

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 QEfficient/transformers/models/modeling_auto.py           | 8 ++------
 .../transformers/models/whisper/modeling_whisper.py       | 5 ++++-
 scripts/Jenkinsfile                                       | 2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index fc10032df6..0b1e3702b6 100755
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1377,7 +1377,7 @@ def export(
                 kv_offload=True,
                 continuous_batching=self.continuous_batching,
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
-                prefill_seq_len=prefill_seq_len,
+                **dummy_inputs_kwargs,
             )
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True,
@@ -1385,11 +1385,7 @@ def export(
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
             )
         except TypeError:
-            inputs = self.model.get_dummy_inputs(
-                kv_offload=True,
-                comp_ctx_lengths=self.comp_ctx_lengths_decode,
-                prefill_seq_len=prefill_seq_len,
-            )
+            inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode)
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode
             )
diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
index 1bdcd07ada..bf01a1779f 100644
--- a/QEfficient/transformers/models/whisper/modeling_whisper.py
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -795,7 +795,10 @@ def get_dummy_inputs(
         **kwargs,
     ):
         bs = 1
-        seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN))
+        seq_len = kwargs.get("prefill_seq_len")
+        if seq_len is None:
+            seq_len = 32
+        seq_len = int(seq_len)
         encoder_seq_len = self.config.max_source_positions
         encoder_feature_count = self.config.num_mel_bins
         num_key_value_heads = self.config.decoder_attention_heads
diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index 0858a08254..49f637c2f9 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -64,7 +64,7 @@ pipeline {
                     pip install .[test] &&
                     pip install junitparser pytest-xdist &&
                     pip install librosa==0.10.2 soundfile==0.13.1 &&
-                    pip install "qwen-vl-utils>=0.0.14" &&
+                    pip install qwen-vl-utils==0.0.14 &&
                     pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1
                     rm -rf QEfficient"
                 '''

From eee709853eae6a432286534292504c5c060d42a1 Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Fri, 22 May 2026 17:58:35 +0530
Subject: [PATCH 05/11] Addressed comments

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 examples/reranker/qwen3vl/README.md           |   7 +-
 .../reranker/qwen3vl/qwen3_vl_reranker.py     |   4 +-
 examples/reranker/qwen3vl/reranker_model.py   | 153 ++++++++++++++----
 3 files changed, 126 insertions(+), 38 deletions(-)

diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md
index 7ebe1d7db8..d9d96645a8 100644
--- a/examples/reranker/qwen3vl/README.md
+++ b/examples/reranker/qwen3vl/README.md
@@ -23,8 +23,11 @@ pip install "qwen-vl-utils>=0.0.14"
 
 ## Scripts
 
-- `qwen3_vl_reranker.py` - simple runnable API usage example.
-- `reranker_model.py` - AI100 dual-QPC implementation adapted from the official Qwen reranker reference:
+- `qwen3_vl_reranker.py` - runnable example that explicitly shows:
+  - `QEFFAutoModelForImageTextToText.from_pretrained(...)`
+  - `model.compile(...)` arguments for QPC generation
+  - AI100 scoring call flow
+- `reranker_model.py` - Qwen3-VL-specific helper logic (prompting/tokenization/scoring/runtime glue) adapted from the official Qwen reranker reference:
   https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py
 
 ## Run
diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
index 01884d0d08..42e2cf5082 100644
--- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py
+++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
@@ -85,13 +85,13 @@ def main() -> None:
     model_source = resolve_model_source(args.model_name)
 
     # 1) Load config + processor + QEff model through public QEff/HF APIs.
-    config = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True)
     if hasattr(config, "use_cache"):
         config.use_cache = True
     if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
         config.text_config.use_cache = True
 
-    processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True)
     model = QEFFAutoModelForImageTextToText.from_pretrained(
         model_source,
         kv_offload=True,
diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py
index 8577c8a979..8cd8a5ed4f 100644
--- a/examples/reranker/qwen3vl/reranker_model.py
+++ b/examples/reranker/qwen3vl/reranker_model.py
@@ -5,32 +5,27 @@
 #
 # ----------------------------------------------------------------------------
 
-"""Core AI100 reranker implementation for Qwen3-VL reranker models.
+"""Qwen3-VL-specific reranker helpers for AI100 runtime.
 
 The tokenization/scoring flow is adapted from the official Qwen reference:
 https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py
 
-This module isolates AI100 dual-QPC runtime details so the user-facing example
-script (`qwen3_vl_reranker.py`) remains focused on simple API usage.
+This module intentionally keeps only Qwen3-VL-specific reranker logic
+(prompt construction, multimodal tokenization, yes/no score computation,
+and AI100 runtime orchestration with compiled QPC paths).
+
+Model loading (`from_pretrained`) and model compilation (`compile`) are exposed
+in `qwen3_vl_reranker.py` so users can directly see QEff API usage.
 """
 
 from typing import Dict, List, Tuple
 
 import numpy as np
 import torch
+from huggingface_hub import snapshot_download
+from qwen_vl_utils import process_vision_info
 
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.transformers.models.qwen3_vl._reranker_utils import (
-    format_mm_content,
-    format_mm_instruction,
-    get_yes_no_token_ids,
-    score_from_logits,
-    tokenize_pair,
-    truncate_tokens_optimized,
-)
-from QEfficient.transformers.models.qwen3_vl._reranker_utils import (
-    resolve_model_source as _resolve_model_source,
-)
 
 # Max token budget used by this example's manual truncation/padding flow.
 MAX_LENGTH = 8192
@@ -48,7 +43,9 @@ def resolve_model_source(model_name_or_path: str) -> str:
     Some transformers versions can fail when resolving chat templates from
     repo-id mode for this model. Using a local snapshot path avoids that path.
     """
-    return _resolve_model_source(model_name_or_path)
+    if os.path.isdir(model_name_or_path):
+        return model_name_or_path
+    return snapshot_download(repo_id=model_name_or_path)
 
 
 class QEffQwen3VLReranker:
@@ -84,40 +81,128 @@ def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float:
         Score formula:
             sigmoid(logit_yes - logit_no)
         """
-        score = score_from_logits(logits, yes_token_id, no_token_id)
+        logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu()
+        if logits_tensor.ndim == 3:
+            logits_tensor = logits_tensor[:, -1, :]
+        score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
         return float(score[0].item())
 
     @staticmethod
     def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
         """Truncate while preserving all special tokens in sequence order."""
-        return truncate_tokens_optimized(tokens, max_length, special_tokens)
+        if len(tokens) <= max_length:
+            return tokens
+
+        special_tokens_set = set(special_tokens)
+        num_special = sum(1 for token in tokens if token in special_tokens_set)
+        num_non_special_to_keep = max_length - num_special
+
+        final_tokens = []
+        non_special_kept_count = 0
+        for token in tokens:
+            if token in special_tokens_set:
+                final_tokens.append(token)
+            elif non_special_kept_count < num_non_special_to_keep:
+                final_tokens.append(token)
+                non_special_kept_count += 1
+        return final_tokens
 
     def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]:
         """Build one multimodal content block (prefix + optional image + optional text)."""
-        return format_mm_content(
-            text=text,
-            image=image,
-            video=video,
-            prefix=prefix,
-            min_pixels=MIN_PIXELS,
-            max_pixels=MAX_PIXELS,
-            unsupported_video_error="Video input is not supported in this AI100-only example.",
-        )
+        content = [{"type": "text", "text": prefix}]
+
+        if not text and not image and not video:
+            content.append({"type": "text", "text": "NULL"})
+            return content
+
+        if video:
+            raise ValueError("Video input is not supported in this AI100-only example.")
+
+        if image:
+            if isinstance(image, str):
+                image_content = image if image.startswith(("http", "oss")) else "file://" + image
+            else:
+                image_content = image
+            content.append(
+                {
+                    "type": "image",
+                    "image": image_content,
+                    "min_pixels": MIN_PIXELS,
+                    "max_pixels": MAX_PIXELS,
+                }
+            )
+
+        if text:
+            content.append({"type": "text", "text": text})
+
+        return content
 
     def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]:
         """Create the chat payload for one query-document pair."""
-        return format_mm_instruction(
-            instruction=instruction,
-            query=query,
-            document=document,
-            min_pixels=MIN_PIXELS,
-            max_pixels=MAX_PIXELS,
-            unsupported_video_error="Video input is not supported in this AI100-only example.",
+        contents = [{"type": "text", "text": "<Instruct>: " + instruction}]
+
+        contents.extend(
+            self._format_mm_content(
+                query.get("text"),
+                query.get("image"),
+                query.get("video"),
+                prefix="<Query>:",
+            )
         )
 
     def _tokenize_pair(self, pair: List[Dict]) -> Dict:
         """Tokenize a query-document pair with the exact HF multimodal pipeline."""
-        return tokenize_pair(self.processor, pair, self.max_length)
+        pairs = [pair]
+        text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)
+
+        images, videos, video_kwargs = process_vision_info(
+            pairs,
+            image_patch_size=16,
+            return_video_kwargs=True,
+            return_video_metadata=True,
+        )
+
+        if videos is not None:
+            videos, video_metadatas = zip(*videos)
+            videos = list(videos)
+            video_metadatas = list(video_metadatas)
+        else:
+            video_metadatas = None
+
+        inputs = self.processor(
+            text=text,
+            images=images,
+            videos=videos,
+            video_metadata=video_metadatas,
+            truncation=False,
+            padding=False,
+            do_resize=False,
+            **video_kwargs,
+        )
+
+        for i, input_ids in enumerate(inputs["input_ids"]):
+            inputs["input_ids"][i] = (
+                self._truncate_tokens_optimized(
+                    input_ids[:-5],
+                    self.max_length,
+                    self.processor.tokenizer.all_special_ids,
+                )
+                + input_ids[-5:]
+            )
+
+        padded = self.processor.tokenizer.pad(
+            {"input_ids": inputs["input_ids"]},
+            padding=True,
+            return_tensors="pt",
+            max_length=self.max_length,
+        )
+        for key in padded:
+            inputs[key] = padded[key]
+
+        if "pixel_values" in inputs:
+            inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+
+        return inputs
 
     def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int):
         """Prepare model inputs for dual-QPC prefill execution."""

From 7d1e2f43f7257d3174f2d66d4f5ee560de13a3da Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Mon, 1 Jun 2026 14:25:26 +0530
Subject: [PATCH 06/11] Rebased and addressed comments

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 .../models/whisper/modeling_whisper.py        |   5 +-
 .../reranker/qwen3vl/qwen3_vl_reranker.py     |   4 +-
 examples/reranker/qwen3vl/reranker_model.py   | 143 ++++--------------
 3 files changed, 34 insertions(+), 118 deletions(-)

diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
index bf01a1779f..1bdcd07ada 100644
--- a/QEfficient/transformers/models/whisper/modeling_whisper.py
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -795,10 +795,7 @@ def get_dummy_inputs(
         **kwargs,
     ):
         bs = 1
-        seq_len = kwargs.get("prefill_seq_len")
-        if seq_len is None:
-            seq_len = 32
-        seq_len = int(seq_len)
+        seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN))
         encoder_seq_len = self.config.max_source_positions
         encoder_feature_count = self.config.num_mel_bins
         num_key_value_heads = self.config.decoder_attention_heads
diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
index 42e2cf5082..01884d0d08 100644
--- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py
+++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
@@ -85,13 +85,13 @@ def main() -> None:
     model_source = resolve_model_source(args.model_name)
 
     # 1) Load config + processor + QEff model through public QEff/HF APIs.
-    config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True)
+    config = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
     if hasattr(config, "use_cache"):
         config.use_cache = True
     if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
         config.text_config.use_cache = True
 
-    processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True)
+    processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True)
     model = QEFFAutoModelForImageTextToText.from_pretrained(
         model_source,
         kv_offload=True,
diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py
index 8cd8a5ed4f..33e73b05f6 100644
--- a/examples/reranker/qwen3vl/reranker_model.py
+++ b/examples/reranker/qwen3vl/reranker_model.py
@@ -22,10 +22,19 @@
 
 import numpy as np
 import torch
-from huggingface_hub import snapshot_download
-from qwen_vl_utils import process_vision_info
 
 from QEfficient.generation.cloud_infer import QAICInferenceSession
+from QEfficient.transformers.models.qwen3_vl._reranker_utils import (
+    format_mm_content,
+    format_mm_instruction,
+    get_yes_no_token_ids,
+    score_from_logits,
+    tokenize_pair,
+    truncate_tokens_optimized,
+)
+from QEfficient.transformers.models.qwen3_vl._reranker_utils import (
+    resolve_model_source as _resolve_model_source,
+)
 
 # Max token budget used by this example's manual truncation/padding flow.
 MAX_LENGTH = 8192
@@ -43,9 +52,7 @@ def resolve_model_source(model_name_or_path: str) -> str:
     Some transformers versions can fail when resolving chat templates from
     repo-id mode for this model. Using a local snapshot path avoids that path.
     """
-    if os.path.isdir(model_name_or_path):
-        return model_name_or_path
-    return snapshot_download(repo_id=model_name_or_path)
+    return _resolve_model_source(model_name_or_path)
 
 
 class QEffQwen3VLReranker:
@@ -81,128 +88,40 @@ def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float:
         Score formula:
             sigmoid(logit_yes - logit_no)
         """
-        logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu()
-        if logits_tensor.ndim == 3:
-            logits_tensor = logits_tensor[:, -1, :]
-        score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
+        score = score_from_logits(logits, yes_token_id, no_token_id)
         return float(score[0].item())
 
     @staticmethod
     def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
         """Truncate while preserving all special tokens in sequence order."""
-        if len(tokens) <= max_length:
-            return tokens
-
-        special_tokens_set = set(special_tokens)
-        num_special = sum(1 for token in tokens if token in special_tokens_set)
-        num_non_special_to_keep = max_length - num_special
-
-        final_tokens = []
-        non_special_kept_count = 0
-        for token in tokens:
-            if token in special_tokens_set:
-                final_tokens.append(token)
-            elif non_special_kept_count < num_non_special_to_keep:
-                final_tokens.append(token)
-                non_special_kept_count += 1
-        return final_tokens
+        return truncate_tokens_optimized(tokens, max_length, special_tokens)
 
     def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]:
         """Build one multimodal content block (prefix + optional image + optional text)."""
-        content = [{"type": "text", "text": prefix}]
-
-        if not text and not image and not video:
-            content.append({"type": "text", "text": "NULL"})
-            return content
-
-        if video:
-            raise ValueError("Video input is not supported in this AI100-only example.")
-
-        if image:
-            if isinstance(image, str):
-                image_content = image if image.startswith(("http", "oss")) else "file://" + image
-            else:
-                image_content = image
-            content.append(
-                {
-                    "type": "image",
-                    "image": image_content,
-                    "min_pixels": MIN_PIXELS,
-                    "max_pixels": MAX_PIXELS,
-                }
-            )
-
-        if text:
-            content.append({"type": "text", "text": text})
-
-        return content
+        return format_mm_content(
+            text=text,
+            image=image,
+            video=video,
+            prefix=prefix,
+            min_pixels=MIN_PIXELS,
+            max_pixels=MAX_PIXELS,
+            unsupported_video_error="Video input is not supported in this AI100-only example.",
+        )
 
     def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]:
         """Create the chat payload for one query-document pair."""
-        contents = [{"type": "text", "text": "<Instruct>: " + instruction}]
-
-        contents.extend(
-            self._format_mm_content(
-                query.get("text"),
-                query.get("image"),
-                query.get("video"),
-                prefix="<Query>:",
-            )
+        return format_mm_instruction(
+            instruction=instruction,
+            query=query,
+            document=document,
+            min_pixels=MIN_PIXELS,
+            max_pixels=MAX_PIXELS,
+            unsupported_video_error="Video input is not supported in this AI100-only example.",
         )
 
     def _tokenize_pair(self, pair: List[Dict]) -> Dict:
         """Tokenize a query-document pair with the exact HF multimodal pipeline."""
-        pairs = [pair]
-        text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)
-
-        images, videos, video_kwargs = process_vision_info(
-            pairs,
-            image_patch_size=16,
-            return_video_kwargs=True,
-            return_video_metadata=True,
-        )
-
-        if videos is not None:
-            videos, video_metadatas = zip(*videos)
-            videos = list(videos)
-            video_metadatas = list(video_metadatas)
-        else:
-            video_metadatas = None
-
-        inputs = self.processor(
-            text=text,
-            images=images,
-            videos=videos,
-            video_metadata=video_metadatas,
-            truncation=False,
-            padding=False,
-            do_resize=False,
-            **video_kwargs,
-        )
-
-        for i, input_ids in enumerate(inputs["input_ids"]):
-            inputs["input_ids"][i] = (
-                self._truncate_tokens_optimized(
-                    input_ids[:-5],
-                    self.max_length,
-                    self.processor.tokenizer.all_special_ids,
-                )
-                + input_ids[-5:]
-            )
-
-        padded = self.processor.tokenizer.pad(
-            {"input_ids": inputs["input_ids"]},
-            padding=True,
-            return_tensors="pt",
-            max_length=self.max_length,
-        )
-        for key in padded:
-            inputs[key] = padded[key]
-
-        if "pixel_values" in inputs:
-            inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
-
-        return inputs
+        return tokenize_pair(self.processor, pair, self.max_length)
 
     def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int):
         """Prepare model inputs for dual-QPC prefill execution."""

From 15d0ff1817fd50149e4c4817e4b03cfc65f38215 Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Thu, 4 Jun 2026 01:05:16 +0530
Subject: [PATCH 07/11] Initial fix

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 QEfficient/transformers/models/modeling_auto.py   |  9 ++++-----
 .../models/qwen3_vl/modeling_qwen3_vl.py          | 15 +++++----------
 examples/reranker/qwen3vl/README.md               |  1 -
 examples/reranker/qwen3vl/qwen3_vl_reranker.py    |  2 --
 examples/reranker/qwen3vl/reranker_model.py       |  5 +++--
 5 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 0b1e3702b6..360aaa13ec 100755
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1367,17 +1367,12 @@ def export(
         List[str]
             A list containing the paths to the generated ONNX graph files for both components.
         """
-        dummy_inputs_kwargs = {}
-        if prefill_seq_len is not None:
-            dummy_inputs_kwargs["prefill_seq_len"] = int(prefill_seq_len)
-
         # TODO This is a temporary change as continous batching is enabled only for few models. Once support is added for all the models this exception handing can be removed.
         try:
             inputs = self.model.get_dummy_inputs(
                 kv_offload=True,
                 continuous_batching=self.continuous_batching,
                 comp_ctx_lengths=self.comp_ctx_lengths_decode,
-                **dummy_inputs_kwargs,
             )
             dynamic_axes = self.model.get_onnx_dynamic_axes(
                 kv_offload=True,
@@ -1678,6 +1673,10 @@ def compile(
             elif prefill_seq_len == 1:
                 specializations = specializations["lang"][-1:]
                 qpc_key = "lang_decode_qpc_path"
+            elif prefill_seq_len is not None and ctx_len is not None and prefill_seq_len == ctx_len:
+                # Single-shot mode (e.g. reranker): no decode steps, only prefill kernel needed.
+                specializations = specializations["lang"][:1]
+                qpc_key = "lang_qpc_path"
             else:
                 specializations = specializations["lang"]
                 qpc_key = "lang_qpc_path"
diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
index 0f6ab210de..9f609ea2ea 100644
--- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
+++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -847,13 +847,8 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
-
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         # vision_size = 1024
         vision_size = 187
         inputs_shapes["vision_embeds"] = (
@@ -865,7 +860,7 @@ def get_dummy_inputs(
         inputs_shapes["position_ids"] = (
             3,
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (748, 1536)
         inputs_shapes["image_idx"] = (1, 1)
@@ -889,8 +884,8 @@ def get_dummy_inputs(
         )
         lang_inputs["position_ids"] = (
             (
-                torch.arange(prefill_seq_len, dtype=torch.int64)
-                .view(1, prefill_seq_len)
+                torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+                .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
                 .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
             )
             .unsqueeze(0)
@@ -908,7 +903,7 @@ def get_dummy_inputs(
         kv_cache_shape = get_padding_shape_from_config(
             config=self.model.config.text_config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md
index d9d96645a8..74bc9d4a2a 100644
--- a/examples/reranker/qwen3vl/README.md
+++ b/examples/reranker/qwen3vl/README.md
@@ -49,7 +49,6 @@ With compile parameters:
 ```bash
 python examples/reranker/qwen3vl/qwen3_vl_reranker.py \
   --model-name Qwen/Qwen3-VL-Reranker-2B \
-  --ctx-len 2048 \
   --num-cores 16 \
   --num-devices 1 \
   --compile-prefill-seq-len 4096 \
diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
index 01884d0d08..504280e7d9 100644
--- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py
+++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
@@ -30,7 +30,6 @@ def parse_args() -> argparse.Namespace:
     """Parse command-line arguments for AI100 compile/inference knobs."""
     parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.")
     parser.add_argument("--model-name", type=str, default="Qwen/Qwen3-VL-Reranker-2B")
-    parser.add_argument("--ctx-len", type=int, default=2048, help="Context length used at compile time.")
     parser.add_argument("--num-cores", type=int, default=16, help="Number of AI100 cores.")
     parser.add_argument("--num-devices", type=int, default=1, help="Number of AI100 devices.")
     parser.add_argument(
@@ -106,7 +105,6 @@ def main() -> None:
     # 3) Derive compile requirements from current payload.
     compile_specs = reranker.get_compile_specs(
         inputs=inputs,
-        ctx_len=args.ctx_len,
         prefill_seq_len=args.compile_prefill_seq_len,
     )
 
diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py
index 33e73b05f6..32c4e65eaa 100644
--- a/examples/reranker/qwen3vl/reranker_model.py
+++ b/examples/reranker/qwen3vl/reranker_model.py
@@ -173,7 +173,7 @@ def _collect_contexts(self, inputs: Dict):
 
         return prepared_contexts, max_prompt_len, max_grid_h, max_grid_w
 
-    def get_compile_specs(self, inputs: Dict, ctx_len: int, prefill_seq_len: int = None) -> Dict[str, int]:
+    def get_compile_specs(self, inputs: Dict, prefill_seq_len: int = None) -> Dict[str, int]:
         """Return compile parameters required for this input batch."""
         _, max_prompt_len, max_grid_h, max_grid_w = self._collect_contexts(inputs)
         if max_prompt_len == 0:
@@ -189,9 +189,10 @@ def get_compile_specs(self, inputs: Dict, ctx_len: int, prefill_seq_len: int = N
         height = max_grid_h * patch_size
         width = max_grid_w * patch_size
 
+        # ctx_len == prefill_seq_len always: reranker is single-shot prefill, no decode steps.
         return {
             "prefill_seq_len": target_prefill_seq_len,
-            "ctx_len": int(ctx_len),
+            "ctx_len": target_prefill_seq_len,
             "img_size": max(height, width),
             "height": height,
             "width": width,

From 28dc773ede838e8161800a67d2168c5ad4cf9a51 Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Thu, 4 Jun 2026 14:38:11 +0530
Subject: [PATCH 08/11] Update the example script and modelling files

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 .../models/gemma3/modeling_gemma3.py          | 20 ++++---------
 .../models/internvl/modeling_internvl.py      | 20 ++++---------
 .../models/llama4/modeling_llama4.py          | 20 ++++---------
 .../models/llava/modeling_llava.py            |  8 ++---
 .../models/llava_next/modeling_llava_next.py  | 10 +++----
 .../models/mistral3/modeling_mistral3.py      | 14 ++++-----
 .../models/mllama/modeling_mllama.py          |  7 ++---
 .../models/molmo/modeling_molmo.py            | 14 ++++-----
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  | 14 ++++-----
 .../qwen3_vl_moe/modeling_qwen3_vl_moe.py     | 14 ++++-----
 .../models/whisper/modeling_whisper.py        |  5 ++--
 tests/configs/image_text_model_configs.json   | 30 -------------------
 tests/configs/reranker_model_configs.json     | 28 +++++++++++++++++
 .../models/reranker/test_reranker_mad.py      |  7 ++---
 .../reranker/test_reranker_models_unit.py     |  9 +++---
 15 files changed, 83 insertions(+), 137 deletions(-)
 create mode 100644 tests/configs/reranker_model_configs.json

diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py
index a3e9257a73..35d9c07cf8 100644
--- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py
+++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py
@@ -969,16 +969,8 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len, dtype=None):
         return past_key_values
 
     def get_dummy_inputs(
-        self,
-        comp_ctx_lengths: Optional[List[int]] = None,
-        kv_offload: bool = False,
-        continuous_batching: bool = False,
-        **kwargs,
+        self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         if vis_cfg := getattr(self.config, "vision_config", None):
             img_size = getattr(vis_cfg, "image_size", 896)
         else:
@@ -987,7 +979,7 @@ def get_dummy_inputs(
         mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256)
         # Define shapes
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         inputs_shapes["vision_embeds"] = (
             1,  # constants.INTERN_NUM_PATCHES,
             mm_tokens_per_image,  # constants.INTERN_FEATURE_SIZE,
@@ -995,7 +987,7 @@ def get_dummy_inputs(
         )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
@@ -1012,8 +1004,8 @@ def get_dummy_inputs(
         lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
         lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype)
         lang_inputs["position_ids"] = (
-            torch.arange(prefill_seq_len, dtype=torch.int64)
-            .view(1, prefill_seq_len)
+            torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+            .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
         lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
@@ -1025,7 +1017,7 @@ def get_dummy_inputs(
         lang_inputs["past_key_values"] = self.get_dummy_pkv_cache(
             config=self.language_model.config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         if comp_ctx_lengths is not None:
diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
index 563c42e256..821381ac0d 100644
--- a/QEfficient/transformers/models/internvl/modeling_internvl.py
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -273,16 +273,8 @@ def get_output_names(self, kv_offload: bool = False):
         return output_names
 
     def get_dummy_inputs(
-        self,
-        comp_ctx_lengths: Optional[List[int]] = None,
-        kv_offload: bool = False,
-        continuous_batching: bool = False,
-        **kwargs,
+        self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         if vis_cfg := getattr(self.config, "vision_config", None):
             img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE)
         else:
@@ -301,7 +293,7 @@ def get_dummy_inputs(
 
         # Define shapes
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         inputs_shapes["vision_embeds"] = (
             1,
             computed_feature_size * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
@@ -309,7 +301,7 @@ def get_dummy_inputs(
         )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (
             constants.INTERN_NUM_PATCHES * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
@@ -329,8 +321,8 @@ def get_dummy_inputs(
             (inputs_shapes["vision_embeds"]), dtype=self.config.vision_config.torch_dtype
         )
         lang_inputs["position_ids"] = (
-            torch.arange(prefill_seq_len, dtype=torch.int64)
-            .view(1, prefill_seq_len)
+            torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+            .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
         lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64)
@@ -342,7 +334,7 @@ def get_dummy_inputs(
         kv_cache_shape = get_padding_shape_from_config(
             config=self.language_model.config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py
index 2cf5dbb2e9..7f90262bec 100644
--- a/QEfficient/transformers/models/llama4/modeling_llama4.py
+++ b/QEfficient/transformers/models/llama4/modeling_llama4.py
@@ -1185,16 +1185,8 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len):
         return past_key_values
 
     def get_dummy_inputs(
-        self,
-        comp_ctx_lengths: Optional[List[int]] = None,
-        kv_offload: bool = False,
-        continuous_batching: bool = False,
-        **kwargs,
+        self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         if vis_cfg := getattr(self.config, "vision_config", None):
             img_size = getattr(vis_cfg, "image_size", 336)
         else:
@@ -1202,7 +1194,7 @@ def get_dummy_inputs(
 
         # Define shapes
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         max_num_tiles = 17
         downsample_ratio = int(round(1.0 / (self.config.vision_config.pixel_shuffle_ratio**2)))
         num_features_per_tile = int(
@@ -1218,7 +1210,7 @@ def get_dummy_inputs(
         )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (
             max_num_tiles,  # constants.INTERN_NUM_PATCHES,
@@ -1234,8 +1226,8 @@ def get_dummy_inputs(
         lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
         lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype)
         lang_inputs["position_ids"] = (
-            torch.arange(prefill_seq_len, dtype=torch.int64)
-            .view(1, prefill_seq_len)
+            torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+            .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
         lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
@@ -1247,7 +1239,7 @@ def get_dummy_inputs(
         past_key_values = self.get_dummy_pkv_cache(
             config=self.language_model.config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
index 88bb5e1027..3fdfd11b9e 100644
--- a/QEfficient/transformers/models/llava/modeling_llava.py
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -168,10 +168,6 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         num_layers = self.config.text_config.num_hidden_layers
         num_key_value_heads = self.config.text_config.num_key_value_heads
         head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads
@@ -186,11 +182,11 @@ def get_dummy_inputs(
             "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=self.config.torch_dtype),
         }
         lang_inputs = {
-            "input_ids": torch.ones((BS, prefill_seq_len), dtype=torch.int64),
+            "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
             "vision_embeds": torch.ones(
                 (BS, vision_size, self.model.language_model.config.hidden_size), dtype=self.config.torch_dtype
             ),
-            "attention_mask": torch.ones((BS, prefill_seq_len), dtype=torch.int64),
+            "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
             "image_idx": torch.zeros((1, 1), dtype=torch.int64),
         }
         lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
index 342269ce50..c2a9137006 100755
--- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py
+++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py
@@ -195,10 +195,6 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.GRANITEVISION_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         num_layers = self.config.text_config.num_hidden_layers
         num_key_value_heads = self.config.text_config.num_key_value_heads
         head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads
@@ -225,9 +221,11 @@ def get_dummy_inputs(
             ),
         }
         lang_inputs = {
-            "input_ids": torch.ones((constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64),
+            "input_ids": torch.ones(
+                (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64
+            ),
             "attention_mask": torch.ones(
-                (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64
+                (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64
             ),
             "vision_embeds": torch.ones(
                 (
diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
index 628d1dee2c..9c37353328 100644
--- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py
+++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py
@@ -346,12 +346,8 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         height = self.config.vision_config.image_size
         width = self.config.vision_config.image_size
         patch_size = self.config.vision_config.patch_size
@@ -367,7 +363,7 @@ def get_dummy_inputs(
         )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
@@ -384,8 +380,8 @@ def get_dummy_inputs(
         lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
         lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype)
         lang_inputs["position_ids"] = (
-            torch.arange(prefill_seq_len, dtype=torch.int64)
-            .view(1, prefill_seq_len)
+            torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+            .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
         lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
@@ -397,7 +393,7 @@ def get_dummy_inputs(
         kv_cache_shape = get_padding_shape_from_config(
             config=self.model.config.text_config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.model.language_model.config.num_hidden_layers)]
diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py
index 45649662a7..d9310c02e4 100644
--- a/QEfficient/transformers/models/mllama/modeling_mllama.py
+++ b/QEfficient/transformers/models/mllama/modeling_mllama.py
@@ -924,12 +924,9 @@ def forward(
         logits = self.lm_head(hidden_states).float()
         return logits, image_idx, outputs.past_key_values, pixel_values
 
-    def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs):
+    def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False):
         BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
-        seq_len = kwargs.get("prefill_seq_len")
-        if seq_len is None:
-            seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        SEQ_LEN = int(seq_len)
+        SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
         CTX_LEN = constants.ONNX_EXPORT_CTX_LEN
 
         txt_cfg = self.config.get_text_config()
diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py
index d59ca4e017..3eefba47f5 100644
--- a/QEfficient/transformers/models/molmo/modeling_molmo.py
+++ b/QEfficient/transformers/models/molmo/modeling_molmo.py
@@ -931,13 +931,9 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         inputs_shapes = {}
         inputs_shapes_lang = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
 
         inputs_shapes["vision_embeds"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
@@ -946,7 +942,7 @@ def get_dummy_inputs(
         )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
@@ -980,8 +976,8 @@ def get_dummy_inputs(
         lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
         lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype)
         lang_inputs["position_ids"] = (
-            torch.arange(prefill_seq_len, dtype=torch.int64)
-            .view(1, prefill_seq_len)
+            torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+            .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
         lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64)
@@ -993,7 +989,7 @@ def get_dummy_inputs(
         kv_cache_shape = get_padding_shape_from_config(
             config=self.config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.n_layers)]
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 357c4af16e..dd70a31c95 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -831,12 +831,8 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
 
         vision_size = 3577
         inputs_shapes["vision_embeds"] = (
@@ -848,7 +844,7 @@ def get_dummy_inputs(
         inputs_shapes["position_ids"] = (
             3,
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (14308, 1176)
         inputs_shapes["image_idx"] = (1, 1)
@@ -862,8 +858,8 @@ def get_dummy_inputs(
         lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype)
         lang_inputs["position_ids"] = (
             (
-                torch.arange(prefill_seq_len, dtype=torch.int64)
-                .view(1, prefill_seq_len)
+                torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+                .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
                 .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
             )
             .unsqueeze(0)
@@ -878,7 +874,7 @@ def get_dummy_inputs(
         kv_cache_shape = get_padding_shape_from_config(
             config=self.model.config.text_config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
index dc741969a4..317c5ee261 100644
--- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
+++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
@@ -867,12 +867,8 @@ def get_dummy_inputs(
         continuous_batching: bool = False,
         **kwargs,
     ):
-        prefill_seq_len = kwargs.get("prefill_seq_len")
-        if prefill_seq_len is None:
-            prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        prefill_seq_len = int(prefill_seq_len)
         inputs_shapes = {}
-        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len)
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
         # vision_size = 1024
         vision_size = 187
         inputs_shapes["vision_embeds"] = (
@@ -884,7 +880,7 @@ def get_dummy_inputs(
         inputs_shapes["position_ids"] = (
             3,
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
-            prefill_seq_len,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
         inputs_shapes["pixel_values"] = (748, 1536)
         inputs_shapes["image_idx"] = (1, 1)
@@ -908,8 +904,8 @@ def get_dummy_inputs(
         )
         lang_inputs["position_ids"] = (
             (
-                torch.arange(prefill_seq_len, dtype=torch.int64)
-                .view(1, prefill_seq_len)
+                torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+                .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
                 .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
             )
             .unsqueeze(0)
@@ -927,7 +923,7 @@ def get_dummy_inputs(
         kv_cache_shape = get_padding_shape_from_config(
             config=self.model.config.text_config,
             batch_size=fbs if continuous_batching else bs,
-            seq_len=prefill_seq_len,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
         lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
index 1bdcd07ada..89c52c9517 100644
--- a/QEfficient/transformers/models/whisper/modeling_whisper.py
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -30,7 +30,7 @@
 from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
 from QEfficient.utils._utils import IOInfo
-from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE, ONNX_EXPORT_EXAMPLE_SEQ_LEN
+from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE
 
 
 class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding):
@@ -792,10 +792,9 @@ def forward(
 
     def get_dummy_inputs(
         self,
-        **kwargs,
     ):
         bs = 1
-        seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN))
+        seq_len = 32
         encoder_seq_len = self.config.max_source_positions
         encoder_feature_count = self.config.num_mel_bins
         num_key_value_heads = self.config.decoder_attention_heads
diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json
index 85df559970..8181faf430 100644
--- a/tests/configs/image_text_model_configs.json
+++ b/tests/configs/image_text_model_configs.json
@@ -724,36 +724,6 @@
       }
     }
   ],
-  "image_text_reranker_models": [
-    {
-      "model_name": "Qwen/Qwen3-VL-Reranker-2B",
-      "model_type": "qwen3_vl",
-      "batch_size": 1,
-      "prompt_len": 128,
-      "ctx_len": 1024,
-      "img_size": 1540,
-      "img_url": "https://picsum.photos/id/237/536/354",
-      "instruction": "Retrieve candidates relevant to the query.",
-      "query_text": "A woman playing with her dog on a beach at sunset.",
-      "document_text": "A woman and her dog spend time together on a beach during sunset.",
-      "num_layers": 1,
-      "additional_params": {}
-    },
-    {
-      "model_name": "Qwen/Qwen3-VL-Reranker-8B",
-      "model_type": "qwen3_vl",
-      "batch_size": 1,
-      "prompt_len": 128,
-      "ctx_len": 1024,
-      "img_size": 1540,
-      "img_url": "https://picsum.photos/id/237/536/354",
-      "instruction": "Retrieve candidates relevant to the query.",
-      "query_text": "A woman playing with her dog on a beach at sunset.",
-      "document_text": "A woman and her dog spend time together on a beach during sunset.",
-      "num_layers": 1,
-      "additional_params": {}
-    }
-  ],
   "image_text_embedding_models": [
     {
       "model_name": "Qwen/Qwen3-VL-Embedding-8B",
diff --git a/tests/configs/reranker_model_configs.json b/tests/configs/reranker_model_configs.json
new file mode 100644
index 0000000000..4427b9da0c
--- /dev/null
+++ b/tests/configs/reranker_model_configs.json
@@ -0,0 +1,28 @@
+[
+  {
+    "model_name": "Qwen/Qwen3-VL-Reranker-2B",
+    "model_type": "qwen3_vl",
+    "batch_size": 1,
+    "prompt_len": 128,
+    "img_size": 1540,
+    "img_url": "https://picsum.photos/id/237/536/354",
+    "instruction": "Retrieve candidates relevant to the query.",
+    "query_text": "A woman playing with her dog on a beach at sunset.",
+    "document_text": "A woman and her dog spend time together on a beach during sunset.",
+    "num_layers": 1,
+    "additional_params": {}
+  },
+  {
+    "model_name": "Qwen/Qwen3-VL-Reranker-8B",
+    "model_type": "qwen3_vl",
+    "batch_size": 1,
+    "prompt_len": 128,
+    "img_size": 1540,
+    "img_url": "https://picsum.photos/id/237/536/354",
+    "instruction": "Retrieve candidates relevant to the query.",
+    "query_text": "A woman playing with her dog on a beach at sunset.",
+    "document_text": "A woman and her dog spend time together on a beach during sunset.",
+    "num_layers": 1,
+    "additional_params": {}
+  }
+]
diff --git a/tests/transformers/models/reranker/test_reranker_mad.py b/tests/transformers/models/reranker/test_reranker_mad.py
index 148935c5a7..4677f96933 100644
--- a/tests/transformers/models/reranker/test_reranker_mad.py
+++ b/tests/transformers/models/reranker/test_reranker_mad.py
@@ -39,7 +39,7 @@
 )
 from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm
 
-CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/image_text_model_configs.json")
+CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json")
 
 PT_AI100_MAD_MAX = 5e-3
 MAX_LENGTH = 8192
@@ -60,8 +60,7 @@
 }
 
 with open(CONFIG_PATH, "r") as f:
-    config_data = json.load(f)
-    reranker_models = config_data["image_text_reranker_models"]
+    reranker_models = json.load(f)
 
 test_reranker_models = [model_config["model_name"] for model_config in reranker_models]
 reranker_model_config_dict = {model["model_name"]: model for model in reranker_models}
@@ -298,7 +297,7 @@ def test_qwen3_vl_reranker_mad_parity(model_name):
         height=compile_height,
         width=compile_width,
         prefill_seq_len=max_prompt_len,
-        ctx_len=model_cfg["ctx_len"],
+        ctx_len=max_prompt_len,
         num_devices=1,
         num_cores=16,
         mxfp6_matmul=False,
diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py
index f3036502e1..b79a3d29c9 100644
--- a/tests/unit_test/models/reranker/test_reranker_models_unit.py
+++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py
@@ -8,7 +8,7 @@
 Generic unit coverage for image-text reranker model entries.
 
 This test is intentionally model-list driven:
-  - Add/remove reranker models only in tests/configs/image_text_model_configs.json
+  - Add/remove reranker models only in tests/configs/reranker_model_configs.json
   - The same unit checks run for every configured reranker model
 """
 
@@ -22,13 +22,12 @@
 
 from QEfficient.utils.test_utils import set_num_layers_vlm
 
-CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/image_text_model_configs.json")
+CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json")
 
 
 def _load_reranker_model_configs() -> List[Dict]:
     with open(CONFIG_PATH, "r", encoding="utf-8") as file:
-        config_data = json.load(file)
-    return config_data.get("image_text_reranker_models", [])
+        return json.load(file)
 
 
 RERANKER_MODEL_CONFIGS = _load_reranker_model_configs()
@@ -51,7 +50,7 @@ def _vision_num_layers(config) -> int:
 
 def test_reranker_model_list_is_present():
     assert RERANKER_MODEL_CONFIGS, (
-        "image_text_reranker_models is empty. Add reranker entries in tests/configs/image_text_model_configs.json."
+        "reranker_model_configs.json is empty. Add reranker entries in tests/configs/reranker_model_configs.json."
     )
 
 

From fc44abeb8fa23b5e19e46c24f899b9d0d550f954 Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Thu, 4 Jun 2026 21:24:06 +0530
Subject: [PATCH 09/11] Embedding: single prefill specialization, remove
 ctx_len from API

Mirror of the reranker fix: Qwen3-VL embedding is single-shot prefill
(reads last-token hidden state as embedding vector, no decode loop).
`get_compile_specs` now returns ctx_len == prefill_seq_len, triggering
Solution A in modeling_auto.py to compile only the Prefill kernel.

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 .../transformers/models/qwen3_vl/_embedding_utils.py       | 7 +++----
 examples/embeddings/qwen3vl/README.md                      | 1 -
 examples/embeddings/qwen3vl/qwen3_vl_embedding.py          | 3 ---
 tests/configs/image_text_model_configs.json                | 1 -
 .../models/embedding_models/test_qwen3vl_embedding_mad.py  | 1 -
 .../models/embedding/test_qwen3vl_embedding_unit.py        | 4 ++--
 6 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py
index ca0316371d..bce751db9d 100644
--- a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py
+++ b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py
@@ -257,9 +257,7 @@ def _collect_contexts(self, inputs: List[Dict[str, Any]]):
 
         return contexts, max_prompt_len, max_grid_h, max_grid_w
 
-    def get_compile_specs(
-        self, inputs: List[Dict[str, Any]], ctx_len: int, prefill_seq_len: int = None
-    ) -> Dict[str, int]:
+    def get_compile_specs(self, inputs: List[Dict[str, Any]], prefill_seq_len: int = None) -> Dict[str, int]:
         """Compute compile-time spec values for the current input batch."""
         _, max_prompt_len, max_grid_h, max_grid_w = self._collect_contexts(inputs)
         if max_prompt_len == 0:
@@ -275,9 +273,10 @@ def get_compile_specs(
         height = max_grid_h * patch_size
         width = max_grid_w * patch_size
 
+        # ctx_len == prefill_seq_len always: embedding is single-shot prefill, no decode steps.
         return {
             "prefill_seq_len": target_prefill_seq_len,
-            "ctx_len": int(ctx_len),
+            "ctx_len": target_prefill_seq_len,
             "img_size": max(height, width),
             "height": height,
             "width": width,
diff --git a/examples/embeddings/qwen3vl/README.md b/examples/embeddings/qwen3vl/README.md
index cff14908cc..6f89fade06 100644
--- a/examples/embeddings/qwen3vl/README.md
+++ b/examples/embeddings/qwen3vl/README.md
@@ -40,7 +40,6 @@ With compile parameters:
 ```bash
 python examples/embeddings/qwen3vl/qwen3_vl_embedding.py \
   --model-name Qwen/Qwen3-VL-Embedding-8B \
-  --ctx-len 2048 \
   --num-cores 16 \
   --num-devices 1 \
   --compile-prefill-seq-len 4096 \
diff --git a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py
index bd707ffb08..b3124352a6 100644
--- a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py
+++ b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py
@@ -24,7 +24,6 @@
 from QEfficient.transformers.models.qwen3_vl._embedding_utils import configure_embedding_model_config
 
 DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Embedding-8B"
-DEFAULT_CTX_LEN = 2048
 DEFAULT_NUM_CORES = 16
 DEFAULT_NUM_DEVICES = 1
 DEFAULT_NUM_HIDDEN_LAYERS = 36
@@ -36,7 +35,6 @@ def parse_args() -> argparse.Namespace:
     """Parse command-line arguments for AI100 compile/inference knobs."""
     parser = argparse.ArgumentParser(description="Qwen3-VL embedding example.")
     parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME)
-    parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.")
     parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.")
     parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.")
     parser.add_argument(
@@ -121,7 +119,6 @@ def main() -> None:
     # 3) Derive compile requirements from current payload.
     compile_specs = embedder.get_compile_specs(
         inputs=model_inputs,
-        ctx_len=args.ctx_len,
         prefill_seq_len=args.compile_prefill_seq_len,
     )
 
diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json
index 8181faf430..d98d0e08a2 100644
--- a/tests/configs/image_text_model_configs.json
+++ b/tests/configs/image_text_model_configs.json
@@ -729,7 +729,6 @@
       "model_name": "Qwen/Qwen3-VL-Embedding-8B",
       "model_type": "qwen3_vl",
       "batch_size": 1,
-      "ctx_len": 2048,
       "num_layers": 1,
       "vision_depth": 9,
       "deepstack_index": 8,
diff --git a/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py b/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py
index d540593b86..885372355d 100644
--- a/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py
+++ b/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py
@@ -108,7 +108,6 @@ def test_qwen3_vl_embedding_cpu_vs_ai100_mad_parity(model_name):
     model_inputs = EXAMPLE_QUERIES + EXAMPLE_DOCUMENTS
     compile_specs = embedder.get_compile_specs(
         inputs=model_inputs,
-        ctx_len=model_cfg["ctx_len"],
         prefill_seq_len=model_cfg.get("compile_prefill_seq_len", None),
     )
     qpc_paths = qeff_model.compile(
diff --git a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py
index ae7c88e837..a602a0f7dd 100644
--- a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py
+++ b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py
@@ -118,8 +118,8 @@ def _fake_run_ai100_prefill(prepared_inputs, vision_outputs, lang_qpc_path):
     monkeypatch.setattr(QEffQwen3VLEmbedder, "_run_ai100_vision", staticmethod(_fake_run_ai100_vision))
     monkeypatch.setattr(QEffQwen3VLEmbedder, "_run_ai100_prefill", staticmethod(_fake_run_ai100_prefill))
 
-    compile_specs = embedder.get_compile_specs(inputs=[{}, {}], ctx_len=64, prefill_seq_len=12)
-    assert compile_specs == {"prefill_seq_len": 12, "ctx_len": 64, "img_size": 160, "height": 96, "width": 160}
+    compile_specs = embedder.get_compile_specs(inputs=[{}, {}], prefill_seq_len=12)
+    assert compile_specs == {"prefill_seq_len": 12, "ctx_len": 12, "img_size": 160, "height": 96, "width": 160}
 
     embeddings = embedder.process(
         inputs=[{}, {}],

From 5fbc0d8c8968fd5f1d7bf16107fb76caa335ffcc Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Thu, 4 Jun 2026 22:44:01 +0530
Subject: [PATCH 10/11] Address review comments: use
 ONNX_EXPORT_EXAMPLE_SEQ_LEN constant and simplify config path

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 QEfficient/transformers/models/whisper/modeling_whisper.py   | 4 ++--
 tests/unit_test/models/reranker/test_reranker_models_unit.py | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
index 89c52c9517..4c30166289 100644
--- a/QEfficient/transformers/models/whisper/modeling_whisper.py
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -30,7 +30,7 @@
 from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
 from QEfficient.utils._utils import IOInfo
-from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE
+from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE, ONNX_EXPORT_EXAMPLE_SEQ_LEN
 
 
 class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding):
@@ -794,7 +794,7 @@ def get_dummy_inputs(
         self,
     ):
         bs = 1
-        seq_len = 32
+        seq_len = ONNX_EXPORT_EXAMPLE_SEQ_LEN
         encoder_seq_len = self.config.max_source_positions
         encoder_feature_count = self.config.num_mel_bins
         num_key_value_heads = self.config.decoder_attention_heads
diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py
index b79a3d29c9..7d1321a98b 100644
--- a/tests/unit_test/models/reranker/test_reranker_models_unit.py
+++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py
@@ -14,7 +14,6 @@
 
 import copy
 import json
-import os
 from typing import Dict, List
 
 import pytest
@@ -22,7 +21,7 @@
 
 from QEfficient.utils.test_utils import set_num_layers_vlm
 
-CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json")
+CONFIG_PATH = "tests/configs/reranker_model_configs.json"
 
 
 def _load_reranker_model_configs() -> List[Dict]:

From e14f780835a4244040c85107086fd39b5c0bde60 Mon Sep 17 00:00:00 2001
From: Amit Raj <amitraj@qti.qualcomm.com>
Date: Fri, 5 Jun 2026 12:22:47 +0530
Subject: [PATCH 11/11] Added support of embedding and reranker model export
 and compile without KV input/output

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>
---
 QEfficient/blocking/attention_blocking.py     |  1 +
 .../transformers/models/modeling_auto.py      |  3 +
 .../models/qwen3_vl/modeling_qwen3_vl.py      | 70 ++++++++++++++++---
 .../reranker/qwen3vl/qwen3_vl_reranker.py     |  1 +
 4 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/QEfficient/blocking/attention_blocking.py b/QEfficient/blocking/attention_blocking.py
index b753420132..cae8840811 100644
--- a/QEfficient/blocking/attention_blocking.py
+++ b/QEfficient/blocking/attention_blocking.py
@@ -81,6 +81,7 @@ def past_key_value_update(
     position_ids: Optional[torch.LongTensor] = None,
     sliding_window: Optional[int] = None,
 ):
+    cache_kwargs = {}
     if past_key_value is not None:
         cache_kwargs = {"batch_index": batch_index, "position_ids": position_ids}
         if sliding_window is not None:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 360aaa13ec..4f5ad61d3d 100755
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1268,6 +1268,9 @@ def __init__(
             )
         self.model = model
         self.config = model.config
+        # Propagate qaic_config to the full model so helpers like _is_single_shot_mode
+        # can detect the mode when get_output_names/get_dummy_inputs are called on it.
+        model.qaic_config = qaic_config
 
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model, qaic_config=qaic_config, **kwargs)
diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
index 9f609ea2ea..cd39a98f0c 100644
--- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
+++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -57,6 +57,18 @@ def _should_export_embedding_output(module) -> bool:
     return False
 
 
+def _is_single_shot_mode(module) -> bool:
+    """True if qaic_config on module/module.model sets no_kv_cache or export_embedding (single-shot, no KV cache)."""
+    for holder in (module, getattr(module, "model", None)):
+        if holder is None:
+            continue
+        qaic_config = getattr(holder, "qaic_config", None)
+        if isinstance(qaic_config, dict):
+            if qaic_config.get("no_kv_cache", False) or qaic_config.get("export_embedding", False):
+                return True
+    return False
+
+
 def qeff_apply_interleaved_mrope(freqs, mrope_section):
     """Apply interleaved MRoPE to 3D rotary embeddings.
     Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
@@ -549,7 +561,9 @@ def forward(
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-        if self.config.use_cache and not isinstance(past_key_values, Cache):
+        return_legacy_cache = False
+        effective_use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if effective_use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
             past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values)
 
@@ -567,7 +581,11 @@ def forward(
         elif position_ids.dim() == 2:
             position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
 
-        target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else (past_seen_tokens if past_seen_tokens > 0 else inputs_embeds.shape[1])
+        )
         causal_mask = _create_causal_mask(
             position_ids=position_ids[0], target_length=target_length, sliding_window=None
         )
@@ -696,7 +714,7 @@ def forward(
         deepstack_features,
         position_ids,
         image_idx,
-        past_key_values,
+        past_key_values=None,
         batch_index: Optional[torch.LongTensor] = None,
         comp_ctx_lengths: Optional[List[int]] = None,
     ):
@@ -705,7 +723,7 @@ def forward(
         selected = input_ids == self.model.config.image_token_id
         indices1 = selected.to(torch.int64).cumsum(1) - 1
         indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1)
-        indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
+        indices0 = torch.arange(selected.shape[0], device=selected.device).view(-1, 1)
         image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
 
         num_features, bs, split_size, C = deepstack_features.shape
@@ -723,13 +741,14 @@ def forward(
             visual_pos_masks = image_mask
             deepstack_visual_embeds = deepstack_features_expanded
 
+        single_shot = _is_single_shot_mode(self)
         outputs = self.language_model(
             inputs_embeds=inputs_embeds,
             position_ids=position_ids,
-            past_key_values=past_key_values,
+            past_key_values=None if single_shot else past_key_values,
             comp_ctx_lengths=comp_ctx_lengths,
             batch_index=batch_index,
-            use_cache=True,
+            use_cache=not single_shot,
             visual_pos_masks=visual_pos_masks,
             deepstack_visual_embeds=deepstack_visual_embeds,
         )
@@ -737,6 +756,10 @@ def forward(
         hidden_states = outputs.last_hidden_state[torch.arange(position_ids[0].shape[0]).view(-1, 1), logit_index]
         logits = self.model.lm_head(hidden_states)
         image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0)
+        if single_shot:
+            if _should_export_embedding_output(self):
+                return logits, vision_embeds, deepstack_features, image_idx, hidden_states
+            return logits, vision_embeds, deepstack_features, image_idx
         if _should_export_embedding_output(self):
             return logits, vision_embeds, deepstack_features, image_idx, hidden_states, outputs.past_key_values
         return logits, vision_embeds, deepstack_features, image_idx, outputs.past_key_values
@@ -920,6 +943,8 @@ def get_dummy_inputs(
             lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.int64)
         inputs = {}
         if kv_offload:
+            if _is_single_shot_mode(self):
+                lang_inputs.pop("past_key_values")
             inputs["vision"] = vision_inputs
             inputs["lang"] = lang_inputs
         else:
@@ -1101,6 +1126,11 @@ def smart_resize(
 
             lang = [lang_prefill, lang_decode]
 
+        # Single-shot (reranker/embedding): no KV cache → ctx_len not referenced in ONNX
+        if _is_single_shot_mode(self):
+            for spec in lang:
+                spec.pop("ctx_len", None)
+
         specializations = {}
 
         if kv_offload:
@@ -1149,6 +1179,10 @@ def get_onnx_dynamic_axes(
         dynamic_axes = {}
 
         if kv_offload:
+            if _is_single_shot_mode(self):
+                for i in range(num_layers):
+                    lang_dynamic_axes.pop(f"past_key.{i}", None)
+                    lang_dynamic_axes.pop(f"past_value.{i}", None)
             dynamic_axes["vision"] = vision_dynamic_axes
             dynamic_axes["lang"] = lang_dynamic_axes
         else:
@@ -1166,11 +1200,25 @@ def get_output_names(self, kv_offload: bool = False):
 
         output_names = {}
         if kv_offload:
-            lang_output_names.insert(1, "vision_embeds_RetainedState")
-            lang_output_names.insert(2, "image_idx_output")
-            lang_output_names.insert(2, "deepstack_features_RetainedState")
-            if _should_export_embedding_output(self):
-                lang_output_names.insert(4, "embedding_output")
+            if _is_single_shot_mode(self):
+                # Single-shot: keep vision/deepstack retained states, drop KV retained states.
+                # Order matches QEffQwen3VLDecoderWrapper.forward single-shot return:
+                # reranker: (logits, vision_embeds, deepstack_features, image_idx)
+                # embedding: (logits, vision_embeds, deepstack_features, image_idx, hidden_states)
+                lang_output_names = [
+                    "logits",
+                    "vision_embeds_RetainedState",
+                    "deepstack_features_RetainedState",
+                    "image_idx_output",
+                ]
+                if _should_export_embedding_output(self):
+                    lang_output_names.append("embedding_output")  # hidden_states is output[4]
+            else:
+                lang_output_names.insert(1, "vision_embeds_RetainedState")
+                lang_output_names.insert(2, "image_idx_output")
+                lang_output_names.insert(2, "deepstack_features_RetainedState")
+                if _should_export_embedding_output(self):
+                    lang_output_names.insert(4, "embedding_output")
             output_names["vision"] = vision_output_names
             output_names["lang"] = lang_output_names
         else:
diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
index 504280e7d9..a3d05c3d21 100644
--- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py
+++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py
@@ -96,6 +96,7 @@ def main() -> None:
         kv_offload=True,
         trust_remote_code=True,
         config=config,
+        qaic_config={"no_kv_cache": True},
     )
 
     # 2) Build reranker helper and reference payload.