From 08bb022d83f4fb76b0fb481140d4563ab5b0db06 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Sat, 18 Apr 2026 10:55:00 +0000 Subject: [PATCH 01/11] Enabling support of rerankers models 2B and 8B of qwen3vl bucket Signed-off-by: Amit Raj --- .../transformers/models/modeling_auto.py | 8 +- .../models/qwen3vl/reranker/README.md | 52 ++ .../qwen3vl/reranker/qwen3_vl_reranker.py | 555 ++++++++++++++++++ tests/configs/image_text_model_configs.json | 2 +- .../image_text_to_text/test_reranker_mad.py | 455 ++++++++++++++ 5 files changed, 1069 insertions(+), 3 deletions(-) create mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/README.md create mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py create mode 100644 tests/transformers/models/image_text_to_text/test_reranker_mad.py diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 0b1e3702b6..fc10032df6 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1377,7 +1377,7 @@ def export( kv_offload=True, continuous_batching=self.continuous_batching, comp_ctx_lengths=self.comp_ctx_lengths_decode, - **dummy_inputs_kwargs, + prefill_seq_len=prefill_seq_len, ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, @@ -1385,7 +1385,11 @@ def export( comp_ctx_lengths=self.comp_ctx_lengths_decode, ) except TypeError: - inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode) + inputs = self.model.get_dummy_inputs( + kv_offload=True, + comp_ctx_lengths=self.comp_ctx_lengths_decode, + prefill_seq_len=prefill_seq_len, + ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode ) diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/README.md b/examples/image_text_to_text/models/qwen3vl/reranker/README.md new file mode 100644 index 
0000000000..a3e715478d --- /dev/null +++ b/examples/image_text_to_text/models/qwen3vl/reranker/README.md @@ -0,0 +1,52 @@ +# Qwen3-VL Reranker Inference + +This directory contains an AI100 example for running Qwen3-VL reranker models with QEfficient and printing per-document relevance scores. + +Supported models: +- `Qwen/Qwen3-VL-Reranker-2B` +- `Qwen/Qwen3-VL-Reranker-8B` + +## What this example does + +- Loads Qwen3-VL reranker from Hugging Face (or local snapshot path). +- Uses QEff dual-QPC execution (vision encoder + language model). +- Runs the same query against multiple text/image documents. +- Prints one score per document in input order. + +## Required package + +- `qwen-vl-utils>=0.0.14` + +```bash +pip install "qwen-vl-utils>=0.0.14" +``` + +## Script + +- `qwen3_vl_reranker.py` + +## Run + +```bash +python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ + --model-name Qwen/Qwen3-VL-Reranker-2B +``` + +Or run with 8B: + +```bash +python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ + --model-name Qwen/Qwen3-VL-Reranker-8B +``` + +With compile parameters: + +```bash +python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ + --model-name Qwen/Qwen3-VL-Reranker-2B \ + --ctx-len 2048 \ + --num-cores 16 \ + --num-devices 1 \ + --compile-prefill-seq-len 4096 \ + --mxfp6-matmul +``` diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py new file mode 100644 index 0000000000..2fdd225571 --- /dev/null +++ b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py @@ -0,0 +1,555 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
DEFAULT_CTX_LEN = 2048
DEFAULT_NUM_CORES = 16
DEFAULT_NUM_DEVICES = 1

# Max token budget used by this example's manual truncation/padding flow.
MAX_LENGTH = 8192
# Pixel constraints used by Qwen3-VL preprocessing.
IMAGE_BASE_FACTOR = 16
IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2
MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR
MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR
FPS = 1.0


class QEffQwen3VLReranker:
    """Score query/document pairs with a Qwen3-VL reranker on AI100.

    Each pair is rendered through the processor's chat template, the
    QEfficient dual-QPC model (vision encoder + language model) is compiled
    once for the largest shape of the request, one prefill is run per document
    and the final-token "yes"/"no" logits are turned into a score.
    """

    @staticmethod
    def _resolve_model_source(model_name_or_path: str) -> str:
        """Return a local model path when given an HF repo id.

        Why:
            Some transformers versions can fail when resolving chat templates
            from repo-id mode for this model. Using a local snapshot path
            avoids that path.
        """
        if os.path.isdir(model_name_or_path):
            return model_name_or_path
        return snapshot_download(repo_id=model_name_or_path)

    def __init__(
        self,
        model_name_or_path: str = DEFAULT_MODEL_NAME,
        ctx_len: int = DEFAULT_CTX_LEN,
        num_cores: int = DEFAULT_NUM_CORES,
        num_devices: int = DEFAULT_NUM_DEVICES,
        mxfp6_matmul: bool = False,
        compile_prefill_seq_len: int = None,
    ):
        """Initialize the AI100-only reranker wrapper.

        This loads:
        - HF config/processor for prompt and multimodal preprocessing.
        - QEFF dual-QPC model wrapper (vision encoder + language decoder).
        - Token ids for "yes"/"no" used to compute reranker scores.

        Parameters
        ----------
        model_name_or_path:
            HF model id or local snapshot path.
        ctx_len, num_cores, num_devices, mxfp6_matmul:
            Forwarded unchanged to ``self.model.compile``.
        compile_prefill_seq_len:
            Optional fixed prefill length; when ``None`` the longest prompt of
            each request is used.
        """
        self.model_name_or_path = model_name_or_path
        self.model_source = self._resolve_model_source(model_name_or_path)
        self.ctx_len = ctx_len
        self.num_cores = num_cores
        self.num_devices = num_devices
        self.mxfp6_matmul = mxfp6_matmul
        self.compile_prefill_seq_len = compile_prefill_seq_len
        self.max_length = MAX_LENGTH
        self.fps = FPS

        # Use local snapshot for stable processor/chat-template loading.
        config = AutoConfig.from_pretrained(self.model_source, trust_remote_code=True, padding=True)
        if hasattr(config, "use_cache"):
            config.use_cache = True
        if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"):
            config.text_config.use_cache = True

        self.processor = AutoProcessor.from_pretrained(self.model_source, trust_remote_code=True, padding=True)
        self.model = QEFFAutoModelForImageTextToText.from_pretrained(
            self.model_source,
            kv_offload=True,
            trust_remote_code=True,
            config=config,
        )

        self.yes_token_id, self.no_token_id = self._get_yes_no_token_ids(self.processor.tokenizer)
        # Compile cache: paths plus the shapes they were compiled for.
        self._compiled_qpc_paths = None
        self._compiled_prefill_seq_len = 0
        self._compiled_height = None
        self._compiled_width = None

    @staticmethod
    def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]:
        """Resolve tokenizer ids for the exact tokens 'yes' and 'no'.

        Raises
        ------
        ValueError
            If either token is absent from ``tokenizer.get_vocab()``.
        """
        vocab = tokenizer.get_vocab()
        if "yes" not in vocab or "no" not in vocab:
            raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.")
        return vocab["yes"], vocab["no"]

    @staticmethod
    def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float:
        """Convert model logits into a reranker relevance score.

        Score formula:
            sigmoid(logit_yes - logit_no)

        ``logits`` may be a NumPy array or a torch tensor of rank 2
        (batch, vocab) or rank 3 (batch, seq, vocab); for rank 3 the last
        sequence position is used. Only the first batch row is returned.

        Raises
        ------
        ValueError
            If the logits have any other rank.
        """
        logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu()
        if logits_tensor.ndim == 3:
            logits_tensor = logits_tensor[:, -1, :]
        elif logits_tensor.ndim != 2:
            # Fail with a clear message instead of an indexing error below.
            raise ValueError(f"Unsupported logits rank for score conversion: {logits_tensor.ndim}")
        # Binary relevance score from yes/no logit gap.
        score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id])
        return float(score[0].item())

    @staticmethod
    def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]:
        """Truncate while preserving all special tokens in sequence order.

        Every token listed in ``special_tokens`` is kept; only the leading
        ``max_length - <number of special tokens>`` other tokens survive.
        Sequences already within ``max_length`` are returned unchanged.
        """
        if len(tokens) <= max_length:
            return tokens

        special_tokens_set = set(special_tokens)
        num_special = sum(1 for token in tokens if token in special_tokens_set)
        num_non_special_to_keep = max_length - num_special

        final_tokens = []
        non_special_kept_count = 0
        for token in tokens:
            if token in special_tokens_set:
                final_tokens.append(token)
            elif non_special_kept_count < num_non_special_to_keep:
                final_tokens.append(token)
                non_special_kept_count += 1
        return final_tokens

    def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]:
        """Build one multimodal content block (prefix + optional image + optional text).

        An entry with no text, image or video is rendered as the literal text
        ``"NULL"``.

        Raises
        ------
        ValueError
            If ``video`` is provided.
        """
        # Prefix helps the model distinguish query vs document sections.
        content = [{"type": "text", "text": prefix}]

        if not text and not image and not video:
            content.append({"type": "text", "text": "NULL"})
            return content

        if video:
            raise ValueError("Video input is not supported in this AI100-only example.")

        if image:
            # Strings that are not http/oss locations are turned into file:// URIs.
            if isinstance(image, str):
                image_content = image if image.startswith(("http", "oss")) else "file://" + image
            else:
                image_content = image
            content.append(
                {
                    "type": "image",
                    "image": image_content,
                    "min_pixels": MIN_PIXELS,
                    "max_pixels": MAX_PIXELS,
                }
            )

        if text:
            content.append({"type": "text", "text": text})

        return content

    def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]:
        """Create the chat payload for one query-document pair.

        Returns a two-message conversation: a fixed system prompt and a user
        message holding the instruction, the query block and the document
        block, each introduced by its section marker.
        """
        # Section markers named by the system prompt below (Instruct / Query /
        # Document); without them the three sections are indistinguishable.
        contents = [{"type": "text", "text": "<Instruct>: " + instruction}]

        contents.extend(
            self._format_mm_content(
                query.get("text"),
                query.get("image"),
                query.get("video"),
                prefix="<Query>:",
            )
        )
        contents.extend(
            self._format_mm_content(
                document.get("text"),
                document.get("image"),
                document.get("video"),
                prefix="\n<Document>:",
            )
        )

        return [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Judge whether the Document meets the requirements based on the Query and the Instruct "
                            'provided. Note that the answer can only be "yes" or "no".'
                        ),
                    }
                ],
            },
            {"role": "user", "content": contents},
        ]

    def _tokenize_pair(self, pair: List[Dict]) -> Dict:
        """Tokenize one query-document conversation with the HF processor.

        The chat template is rendered to text, vision inputs are extracted
        with ``process_vision_info``, and the processor is called without
        truncation or padding. Token ids are then truncated manually and
        re-padded through ``tokenizer.pad``.
        """
        # Processor expects list-of-conversations.
        pairs = [pair]
        text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True)

        # Build image/video tensors + metadata for processor inputs.
        images, videos, video_kwargs = process_vision_info(
            pairs,
            image_patch_size=16,
            return_video_kwargs=True,
            return_video_metadata=True,
        )

        if videos is not None:
            videos, video_metadatas = zip(*videos)
            videos = list(videos)
            video_metadatas = list(video_metadatas)
        else:
            video_metadatas = None

        inputs = self.processor(
            text=text,
            images=images,
            videos=videos,
            video_metadata=video_metadatas,
            truncation=False,
            padding=False,
            do_resize=False,
            **video_kwargs,
        )

        # The last five ids are kept verbatim (trailing template control
        # tokens); only the part before them is subject to truncation.
        for i, input_ids in enumerate(inputs["input_ids"]):
            inputs["input_ids"][i] = (
                self._truncate_tokens_optimized(
                    input_ids[:-5],
                    self.max_length,
                    self.processor.tokenizer.all_special_ids,
                )
                + input_ids[-5:]
            )

        # Re-pad through tokenizer utilities so masks align with token ids.
        padded = self.processor.tokenizer.pad(
            {"input_ids": inputs["input_ids"]},
            padding=True,
            return_tensors="pt",
            max_length=self.max_length,
        )
        for key in padded:
            inputs[key] = padded[key]

        if "pixel_values" in inputs:
            # Keep pixels fp32 before explicit cast to fp16 during vision run.
            inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)

        return inputs

    def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int = None):
        """Prepare model inputs for dual-QPC prefill execution.

        Returns
        -------
        tuple
            ``(prepared_inputs, runtime_prompt_len)`` where
            ``runtime_prompt_len`` is the token count before padding.

        Raises
        ------
        ValueError
            If ``prefill_seq_len`` is shorter than the prompt.
        """
        runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1])
        effective_prefill = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len
        if effective_prefill < runtime_prompt_len:
            raise ValueError(
                f"prefill_seq_len ({effective_prefill}) must be >= runtime prompt length ({runtime_prompt_len})."
            )

        # Let model helper compute position_ids and multimodal placement.
        prepared_inputs = self.model.model.prepare_inputs_for_generation(
            inputs=tokenized_inputs,
            prefill_seq_len=effective_prefill,
            batch_size=1,
        )

        # A rank-2 image_grid_thw is replaced by a zero tensor whose *shape*
        # is (1, t, h, w) taken from its first row.
        if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2:
            thw = prepared_inputs["image_grid_thw"][0]
            t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item())
            prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype)

        if "pixel_values" in prepared_inputs:
            prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32)

        return prepared_inputs, runtime_prompt_len

    def _ensure_compiled(self, prefill_seq_len: int, height: int, width: int):
        """Compile QPCs if needed, otherwise reuse cached compiled artifacts.

        The cache is reused when the image size matches and the requested
        prefill length does not exceed the compiled one. Callers must
        therefore pad to ``self._compiled_prefill_seq_len``, which can be
        larger than the ``prefill_seq_len`` requested here.
        """
        same_image_shape = (
            self._compiled_qpc_paths is not None and height == self._compiled_height and width == self._compiled_width
        )
        if same_image_shape and prefill_seq_len <= self._compiled_prefill_seq_len:
            return

        # Only the prefill length grew: keep the vision QPC, recompile the rest.
        reuse_vision_qpc = same_image_shape

        compiled_paths = self.model.compile(
            prefill_seq_len=prefill_seq_len,
            ctx_len=self.ctx_len,
            img_size=max(height, width),
            height=height,
            width=width,
            num_cores=self.num_cores,
            num_devices=self.num_devices,
            mxfp6_matmul=self.mxfp6_matmul,
            skip_vision=reuse_vision_qpc,
        )
        if reuse_vision_qpc:
            compiled_paths["vision_qpc_path"] = self._compiled_qpc_paths["vision_qpc_path"]

        self._compiled_qpc_paths = compiled_paths
        self._compiled_prefill_seq_len = prefill_seq_len
        self._compiled_height = height
        self._compiled_width = width

    @staticmethod
    def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Create zero-valued placeholders matching vision output buffers."""
        return {name: np.zeros_like(value) for name, value in vision_outputs.items()}

    def _run_ai100_vision(self, prepared_inputs) -> Dict[str, np.ndarray]:
        """Run the compiled vision encoder QPC and return its outputs.

        Raises
        ------
        ValueError
            If ``pixel_values`` or ``image_grid_thw`` is missing.
        """
        if "pixel_values" not in prepared_inputs or "image_grid_thw" not in prepared_inputs:
            raise ValueError("Missing pixel_values/image_grid_thw for vision execution.")

        vision_session = QAICInferenceSession(self._compiled_qpc_paths["vision_qpc_path"])
        try:
            return vision_session.run(
                {
                    # Pixels are cast to fp16 and the grid to int64 for the QPC.
                    "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16),
                    "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64),
                }
            )
        finally:
            # Release the session even when the run fails.
            vision_session.deactivate()

    def _run_ai100_prefill(
        self,
        prepared_inputs,
        vision_template: Dict[str, np.ndarray],
        vision_outputs: Dict[str, np.ndarray] = None,
    ) -> np.ndarray:
        """Run one prefill pass on the AI100 language QPC and return logits.

        Parameters
        ----------
        prepared_inputs:
            Output of ``_prepare_inputs``.
        vision_template:
            Vision outputs of any image document; used only for buffer shapes
            when this document carries no image.
        vision_outputs:
            Optional vision outputs already computed for this document. When
            given, the vision encoder is not run again.
        """
        # Match runtime input to the length of the prepared position ids.
        prefill_len = prepared_inputs["position_ids"].shape[-1]
        input_ids = prepared_inputs["input_ids"]
        if input_ids.shape[1] < prefill_len:
            pad = torch.full(
                (input_ids.shape[0], prefill_len - input_ids.shape[1]),
                1,
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
            input_ids = torch.cat([input_ids, pad], dim=1)
        else:
            input_ids = input_ids[:, :prefill_len]

        position_ids = prepared_inputs["position_ids"][..., :prefill_len]

        if vision_outputs is None:
            if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
                vision_outputs = self._run_ai100_vision(prepared_inputs)
            else:
                # Text-only document: zero buffers with matching shapes.
                vision_outputs = self._zero_vision_outputs(vision_template)

        lang_session = QAICInferenceSession(self._compiled_qpc_paths["lang_qpc_path"])
        try:
            # Skip past/retained buffers and run only required prefill inputs.
            lang_session.skip_buffers(
                [
                    name
                    for name in lang_session.input_names + lang_session.output_names
                    if name.startswith("past_") or name.endswith("_RetainedState")
                ]
            )
            lang_session.set_buffers(vision_outputs)
            outputs = lang_session.run(
                {
                    "input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
                    "position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
                    "image_idx": np.zeros((1, 1), dtype=np.int64),
                }
            )
        finally:
            # Release the session even when the run fails.
            lang_session.deactivate()
        return outputs["logits"]

    def process(self, inputs: Dict) -> List[float]:
        """Score all documents for one query on AI100.

        High-level flow:
            1) Build model-ready query-document pairs.
            2) Find max prompt/image shape across all docs.
            3) Compile (or reuse) one specialization covering that shape.
            4) Run prefill per doc and convert logits -> score.

        Parameters
        ----------
        inputs:
            Mapping with a required ``"instruction"`` and optional ``"query"``
            and ``"documents"`` entries.

        Returns
        -------
        list of float
            One score per document, in input order; ``[]`` when there are no
            documents.

        Raises
        ------
        ValueError
            If ``compile_prefill_seq_len`` is shorter than the longest prompt,
            or if no document contains an image.
        """
        instruction = inputs["instruction"]
        query = inputs.get("query", {})
        documents = inputs.get("documents", [])

        # Tokenize everything first so we can compile once for the largest
        # prompt/image shape of this request.
        tokenized_docs = []
        max_prompt_len = 0
        max_grid_h = 22
        max_grid_w = 34

        for document in documents:
            pair = self._format_mm_instruction(instruction, query, document)
            tokenized = self._tokenize_pair(pair)
            max_prompt_len = max(max_prompt_len, int(tokenized["input_ids"].shape[1]))

            if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0:
                grid = tokenized["image_grid_thw"]
                max_grid_h = max(max_grid_h, int(grid[..., 1].max().item()))
                max_grid_w = max(max_grid_w, int(grid[..., 2].max().item()))

            tokenized_docs.append(tokenized)

        # Empty documents list => no scores.
        if max_prompt_len == 0:
            return []

        # Convert max grid to compile-time pixel dimensions using model patch size.
        patch_size = int(self.model.model.config.vision_config.patch_size)
        compile_height = max_grid_h * patch_size
        compile_width = max_grid_w * patch_size

        target_prefill_seq_len = max_prompt_len
        if self.compile_prefill_seq_len is not None:
            if self.compile_prefill_seq_len < max_prompt_len:
                raise ValueError(
                    f"--compile-prefill-seq-len ({self.compile_prefill_seq_len}) must be >= "
                    f"max runtime prompt length ({max_prompt_len})."
                )
            target_prefill_seq_len = self.compile_prefill_seq_len

        self._ensure_compiled(target_prefill_seq_len, compile_height, compile_width)
        # A cached QPC may have been compiled for a longer prefill than this
        # request needs; inputs must be padded to the compiled length.
        prefill_seq_len = self._compiled_prefill_seq_len

        prepared_docs = []
        vision_template = None
        for tokenized in tokenized_docs:
            prepared_inputs, _ = self._prepare_inputs(tokenized, prefill_seq_len=prefill_seq_len)

            # The first image document supplies the buffer-shape template; its
            # vision outputs are kept so the encoder is not run twice for it.
            doc_vision_outputs = None
            if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
                vision_template = self._run_ai100_vision(prepared_inputs)
                doc_vision_outputs = vision_template

            prepared_docs.append((prepared_inputs, doc_vision_outputs))

        # This example currently expects at least one image document to establish
        # retained-state buffer shapes for mixed image/text batches.
        if vision_template is None:
            raise ValueError("At least one image document is required to initialize AI100 vision buffers.")

        scores = []
        for prepared_inputs, doc_vision_outputs in prepared_docs:
            logits = self._run_ai100_prefill(
                prepared_inputs,
                vision_template=vision_template,
                vision_outputs=doc_vision_outputs,
            )
            scores.append(self._score_from_logits(logits, self.yes_token_id, self.no_token_id))

        return scores
def _build_arg_parser() -> argparse.ArgumentParser:
    """Return the CLI parser: model selection plus compile-time options."""
    parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.")
    parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME)
    parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.")
    parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.")
    parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.")
    parser.add_argument(
        "--mxfp6-matmul",
        action="store_true",
        help="Enable MXFP6 matmul during compile (default: disabled).",
    )
    parser.add_argument(
        "--compile-prefill-seq-len",
        type=int,
        default=None,
        help=(
            "Optional fixed prefill sequence length for compile/padding. "
            "Must be >= max prompt length of the current request."
        ),
    )
    return parser


def main():
    """Score a fixed three-document example request and print the scores."""
    args = _build_arg_parser().parse_args()

    model = QEffQwen3VLReranker(
        model_name_or_path=args.model_name,
        ctx_len=args.ctx_len,
        num_cores=args.num_cores,
        num_devices=args.num_devices,
        mxfp6_matmul=args.mxfp6_matmul,
        compile_prefill_seq_len=args.compile_prefill_seq_len,
    )

    # Example input payload matching the HF reranker schema.
    # NOTE: "fps" is part of that schema but is not read by process().
    inputs = {
        "instruction": "Retrieve images or text relevant to the user's query.",
        "query": {"text": "A woman playing with her dog on a beach at sunset."},
        "documents": [
            {
                "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."
            },
            {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
            {
                "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
        ],
        "fps": 1.0,
    }

    # Print one score per document in the same order as inputs["documents"].
    scores = model.process(inputs)
    print(scores)


if __name__ == "__main__":
    main()
def _format_mm_content(text, image, video, prefix: str) -> List[Dict]:
    """Build one multimodal content block (prefix + optional image + optional text).

    An entry with no text, image or video is rendered as the literal text
    ``"NULL"``. Raises ``ValueError`` when ``video`` is provided.
    """
    content = [{"type": "text", "text": prefix}]

    if not text and not image and not video:
        content.append({"type": "text", "text": "NULL"})
        return content

    if video:
        raise ValueError("Video input is not supported in this test.")

    if image:
        # Strings that are not http/oss locations are turned into file:// URIs.
        if isinstance(image, str):
            image_content = image if image.startswith(("http", "oss")) else "file://" + image
        else:
            image_content = image
        content.append(
            {
                "type": "image",
                "image": image_content,
                "min_pixels": MIN_PIXELS,
                "max_pixels": MAX_PIXELS,
            }
        )

    if text:
        content.append({"type": "text", "text": text})

    return content


def _format_mm_instruction(instruction: str, query: Dict, document: Dict) -> List[Dict]:
    """Create the two-message chat payload for one query-document pair.

    The user message holds the instruction, the query block and the document
    block, each introduced by the section marker that the system prompt
    refers to (Instruct / Query / Document).
    """
    contents = [{"type": "text", "text": "<Instruct>: " + instruction}]

    contents.extend(
        _format_mm_content(
            query.get("text"),
            query.get("image"),
            query.get("video"),
            prefix="<Query>:",
        )
    )
    contents.extend(
        _format_mm_content(
            document.get("text"),
            document.get("image"),
            document.get("video"),
            prefix="\n<Document>:",
        )
    )

    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Judge whether the Document meets the requirements based on the Query and the Instruct "
                        'provided. Note that the answer can only be "yes" or "no".'
                    ),
                }
            ],
        },
        {"role": "user", "content": contents},
    ]
def _mad_stats(reference: np.ndarray, candidate: np.ndarray) -> Tuple[float, float]:
    """Return ``(mean, max)`` of the element-wise absolute difference.

    Both inputs are converted to float64 arrays and must have identical
    shapes: NumPy broadcasting would otherwise compare, for example, two
    reference scores against a single candidate score without complaint and
    hide a missing document.

    Raises
    ------
    ValueError
        If the shapes differ or the inputs are empty.
    """
    reference = np.asarray(reference, dtype=np.float64)
    candidate = np.asarray(candidate, dtype=np.float64)
    if reference.shape != candidate.shape:
        raise ValueError(
            f"Score shape mismatch: reference {reference.shape} vs candidate {candidate.shape}."
        )
    if reference.size == 0:
        raise ValueError("Cannot compute MAD statistics for empty score arrays.")

    diff = np.abs(reference - candidate)
    return float(np.mean(diff)), float(np.max(diff))
def _run_ai100_prefill(qpc_paths, prepared_inputs, vision_template):
    """Run one prefill pass on the AI100 language QPC and return its logits.

    Parameters
    ----------
    qpc_paths:
        Dict holding ``"vision_qpc_path"`` and ``"lang_qpc_path"``.
    prepared_inputs:
        Mapping with ``input_ids`` and ``position_ids`` and, for image
        documents, ``pixel_values`` and ``image_grid_thw``.
    vision_template:
        Vision outputs of an image document; used only to build zero buffers
        of matching shape when ``prepared_inputs`` carries no image.

    Raises
    ------
    ValueError
        If ``qpc_paths`` is not a dict or lacks a path, or if a text-only
        input is given without a ``vision_template``.
    """
    if not isinstance(qpc_paths, dict):
        raise ValueError("Expected qpc_paths to be a dict with vision/lang QPC keys.")

    vision_qpc_path = qpc_paths.get("vision_qpc_path")
    lang_qpc_path = qpc_paths.get("lang_qpc_path")
    if vision_qpc_path is None or lang_qpc_path is None:
        raise ValueError("Missing vision_qpc_path/lang_qpc_path in compiled QPC outputs.")

    # Match runtime input to the length of the prepared position ids.
    prefill_len = prepared_inputs["position_ids"].shape[-1]
    input_ids = prepared_inputs["input_ids"]
    if input_ids.shape[1] < prefill_len:
        pad = torch.full(
            (input_ids.shape[0], prefill_len - input_ids.shape[1]),
            1,
            dtype=input_ids.dtype,
            device=input_ids.device,
        )
        input_ids = torch.cat([input_ids, pad], dim=1)
    else:
        input_ids = input_ids[:, :prefill_len]
    position_ids = prepared_inputs["position_ids"][..., :prefill_len]

    if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs:
        vision_outputs = _run_ai100_vision(vision_qpc_path, prepared_inputs)
    elif vision_template is None:
        # Without a template the zero-buffer shapes are unknown.
        raise ValueError("vision_template is required for inputs without pixel_values/image_grid_thw.")
    else:
        vision_outputs = _zero_vision_outputs(vision_template)

    lang_session = QAICInferenceSession(lang_qpc_path)
    try:
        # Skip past/retained buffers and run only required prefill inputs.
        lang_session.skip_buffers(
            [
                name
                for name in lang_session.input_names + lang_session.output_names
                if name.startswith("past_") or name.endswith("_RetainedState")
            ]
        )
        lang_session.set_buffers(vision_outputs)
        lang_inputs = {
            "input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
            "position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
            "image_idx": np.zeros((1, 1), dtype=np.int64),
        }
        outputs = lang_session.run(lang_inputs)
    finally:
        # Release the session even when the run fails.
        lang_session.deactivate()
    return outputs["logits"]
_make_score_linear(model_hf, yes_token_id, no_token_id).to(next(model_hf.parameters()).device) + score_linear = score_linear.to(dtype=next(model_hf.parameters()).dtype) + + doc_contexts = [] + max_prompt_len = 0 + max_grid_h = 22 + max_grid_w = 34 + + hf_scores_list = [] + + documents = EXAMPLE_INPUTS["documents"] + if RERANKER_DOC_LIMIT > 0: + documents = documents[:RERANKER_DOC_LIMIT] + + for document in documents: + pair = _format_mm_instruction( + instruction=EXAMPLE_INPUTS["instruction"], + query=EXAMPLE_INPUTS["query"], + document=document, + ) + tokenized = _tokenize_pair(processor, pair) + runtime_prompt_len = int(tokenized["input_ids"].shape[1]) + + hf_inputs = {} + for key, value in tokenized.items(): + hf_inputs[key] = value.to(next(model_hf.parameters()).device) if torch.is_tensor(value) else value + with torch.no_grad(): + hf_last_hidden = model_hf.model(**hf_inputs).last_hidden_state + hf_score = _score_from_last_hidden(hf_last_hidden, score_linear)[0] + hf_scores_list.append(float(hf_score)) + + if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: + grid = tokenized["image_grid_thw"] + max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) + max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) + + doc_contexts.append( + { + "tokenized": tokenized, + } + ) + max_prompt_len = max(max_prompt_len, runtime_prompt_len) + + patch_size = int(qeff_model.model.config.vision_config.patch_size) + compile_height = max_grid_h * patch_size + compile_width = max_grid_w * patch_size + + qpc_paths = qeff_model.compile( + img_size=max(compile_height, compile_width), + height=compile_height, + width=compile_width, + prefill_seq_len=max_prompt_len, + ctx_len=model_cfg["ctx_len"], + num_devices=1, + num_cores=16, + mxfp6_matmul=False, + ) + + ai100_scores_list = [] + + prepared_contexts = [] + vision_template_ai100 = None + for context in doc_contexts: + prepared_inputs, _ = _prepare_qeff_inputs( + qeff_model=qeff_model, + 
tokenized_inputs=context["tokenized"], + prefill_seq_len=max_prompt_len, + ) + prepared_contexts.append( + { + "prepared_inputs": prepared_inputs, + } + ) + if vision_template_ai100 is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: + vision_template_ai100 = _run_ai100_vision(qpc_paths["vision_qpc_path"], prepared_inputs) + + if vision_template_ai100 is None: + raise ValueError("Expected at least one image document to initialize vision templates.") + + for context in prepared_contexts: + prepared_inputs_runtime = context["prepared_inputs"] + ai100_logits = _run_ai100_prefill( + qpc_paths=qpc_paths, + prepared_inputs=prepared_inputs_runtime, + vision_template=vision_template_ai100, + ) + ai100_score = _score_from_logits(ai100_logits, yes_token_id, no_token_id)[0] + ai100_scores_list.append(float(ai100_score)) + + hf_scores = np.array(hf_scores_list, dtype=np.float64) + ai100_scores = np.array(ai100_scores_list, dtype=np.float64) + + print(f"[SCORES] PyTorch(original): {hf_scores.tolist()}") + print(f"[SCORES] AI100: {ai100_scores.tolist()}") + + pt_ai100_mad_mean, pt_ai100_mad_max = _mad_stats(hf_scores, ai100_scores) + print(f"[MAD] PyTorch(original) vs AI100: mean={pt_ai100_mad_mean:.6e}, max={pt_ai100_mad_max:.6e}") + assert pt_ai100_mad_max <= PT_AI100_MAD_MAX, ( + f"PyTorch(original) vs AI100 MAD max {pt_ai100_mad_max:.6e} " + f"exceeds threshold {PT_AI100_MAD_MAX:.6e}. " + f"Check tokenizer ids, prompt formatting, runtime prompt length slicing, and compile dimensions." 
+ ) From 711fd8100adde2b5acb2e1837908c2b86a6f08cf Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 19 May 2026 08:50:39 +0530 Subject: [PATCH 02/11] Functionality changes to PR and rebase with main branch Signed-off-by: Amit Raj --- .../models/qwen3vl/reranker/README.md | 52 -- .../qwen3vl/reranker/qwen3_vl_reranker.py | 555 ------------------ tests/configs/image_text_model_configs.json | 2 +- .../image_text_to_text/test_reranker_mad.py | 455 -------------- 4 files changed, 1 insertion(+), 1063 deletions(-) delete mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/README.md delete mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py delete mode 100644 tests/transformers/models/image_text_to_text/test_reranker_mad.py diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/README.md b/examples/image_text_to_text/models/qwen3vl/reranker/README.md deleted file mode 100644 index a3e715478d..0000000000 --- a/examples/image_text_to_text/models/qwen3vl/reranker/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Qwen3-VL Reranker Inference - -This directory contains an AI100 example for running Qwen3-VL reranker models with QEfficient and printing per-document relevance scores. - -Supported models: -- `Qwen/Qwen3-VL-Reranker-2B` -- `Qwen/Qwen3-VL-Reranker-8B` - -## What this example does - -- Loads Qwen3-VL reranker from Hugging Face (or local snapshot path). -- Uses QEff dual-QPC execution (vision encoder + language model). -- Runs the same query against multiple text/image documents. -- Prints one score per document in input order. 
- -## Required package - -- `qwen-vl-utils>=0.0.14` - -```bash -pip install "qwen-vl-utils>=0.0.14" -``` - -## Script - -- `qwen3_vl_reranker.py` - -## Run - -```bash -python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ - --model-name Qwen/Qwen3-VL-Reranker-2B -``` - -Or run with 8B: - -```bash -python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ - --model-name Qwen/Qwen3-VL-Reranker-8B -``` - -With compile parameters: - -```bash -python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ - --model-name Qwen/Qwen3-VL-Reranker-2B \ - --ctx-len 2048 \ - --num-cores 16 \ - --num-devices 1 \ - --compile-prefill-seq-len 4096 \ - --mxfp6-matmul -``` diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py deleted file mode 100644 index 2fdd225571..0000000000 --- a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py +++ /dev/null @@ -1,555 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import argparse -import os -from typing import Dict, List, Tuple - -import numpy as np -import torch -from huggingface_hub import snapshot_download -from qwen_vl_utils import process_vision_info -from transformers import AutoConfig, AutoProcessor - -from QEfficient import QEFFAutoModelForImageTextToText -from QEfficient.generation.cloud_infer import QAICInferenceSession - -DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B" -DEFAULT_CTX_LEN = 2048 -DEFAULT_NUM_CORES = 16 -DEFAULT_NUM_DEVICES = 1 - -# Max token budget used by this example's manual truncation/padding flow. -MAX_LENGTH = 8192 -# Pixel constraints used by Qwen3-VL preprocessing. 
-IMAGE_BASE_FACTOR = 16 -IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2 -MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR -MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR -FPS = 1.0 - - -class QEffQwen3VLReranker: - @staticmethod - def _resolve_model_source(model_name_or_path: str) -> str: - """Return a local model path when given an HF repo id. - - Why: - Some transformers versions can fail when resolving chat templates from - repo-id mode for this model. Using a local snapshot path avoids that path. - """ - if os.path.isdir(model_name_or_path): - return model_name_or_path - return snapshot_download(repo_id=model_name_or_path) - - def __init__( - self, - model_name_or_path: str = DEFAULT_MODEL_NAME, - ctx_len: int = DEFAULT_CTX_LEN, - num_cores: int = DEFAULT_NUM_CORES, - num_devices: int = DEFAULT_NUM_DEVICES, - mxfp6_matmul: bool = False, - compile_prefill_seq_len: int = None, - ): - """Initialize the AI100-only reranker wrapper. - - This loads: - - HF config/processor for prompt and multimodal preprocessing. - - QEFF dual-QPC model wrapper (vision encoder + language decoder). - - Token ids for "yes"/"no" used to compute reranker scores. - - Parameters - ---------- - model_name_or_path: - HF model id or local snapshot path. - """ - self.model_name_or_path = model_name_or_path - self.model_source = self._resolve_model_source(model_name_or_path) - self.ctx_len = ctx_len - self.num_cores = num_cores - self.num_devices = num_devices - self.mxfp6_matmul = mxfp6_matmul - self.compile_prefill_seq_len = compile_prefill_seq_len - self.max_length = MAX_LENGTH - self.fps = FPS - - # Use local snapshot for stable processor/chat-template loading. 
- config = AutoConfig.from_pretrained(self.model_source, trust_remote_code=True, padding=True) - if hasattr(config, "use_cache"): - config.use_cache = True - if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): - config.text_config.use_cache = True - - self.processor = AutoProcessor.from_pretrained(self.model_source, trust_remote_code=True, padding=True) - self.model = QEFFAutoModelForImageTextToText.from_pretrained( - self.model_source, - kv_offload=True, - trust_remote_code=True, - config=config, - ) - - self.yes_token_id, self.no_token_id = self._get_yes_no_token_ids(self.processor.tokenizer) - self._compiled_qpc_paths = None - self._compiled_prefill_seq_len = 0 - self._compiled_height = None - self._compiled_width = None - - @staticmethod - def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]: - """Resolve tokenizer ids for the exact tokens 'yes' and 'no'.""" - vocab = tokenizer.get_vocab() - if "yes" not in vocab or "no" not in vocab: - raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.") - return vocab["yes"], vocab["no"] - - @staticmethod - def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: - """Convert model logits into a reranker relevance score. - - Score formula: - sigmoid(logit_yes - logit_no) - """ - # Convert runtime output to torch and use final-token logits. - logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() - if logits_tensor.ndim == 3: - logits_tensor = logits_tensor[:, -1, :] - # Binary relevance score from yes/no logit gap. 
- score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) - return float(score[0].item()) - - @staticmethod - def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: - """Truncate while preserving all special tokens in sequence order.""" - if len(tokens) <= max_length: - return tokens - - # Preserve all special/control tokens and trim only non-special tokens. - special_tokens_set = set(special_tokens) - num_special = sum(1 for token in tokens if token in special_tokens_set) - num_non_special_to_keep = max_length - num_special - - final_tokens = [] - non_special_kept_count = 0 - for token in tokens: - if token in special_tokens_set: - final_tokens.append(token) - elif non_special_kept_count < num_non_special_to_keep: - final_tokens.append(token) - non_special_kept_count += 1 - return final_tokens - - def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: - """Build one multimodal content block (prefix + optional image + optional text).""" - # Prefix helps the model distinguish query vs document sections. - content = [{"type": "text", "text": prefix}] - - if not text and not image and not video: - content.append({"type": "text", "text": "NULL"}) - return content - - if video: - raise ValueError("Video input is not supported in this AI100-only example.") - - if image: - # Convert local paths to file:// URIs for the processor. 
- if isinstance(image, str): - image_content = image if image.startswith(("http", "oss")) else "file://" + image - else: - image_content = image - content.append( - { - "type": "image", - "image": image_content, - "min_pixels": MIN_PIXELS, - "max_pixels": MAX_PIXELS, - } - ) - - if text: - content.append({"type": "text", "text": text}) - - return content - - def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: - """Create the chat payload for one query-document pair.""" - # Prompt shape follows the HF reranker reference format. - contents = [{"type": "text", "text": ": " + instruction}] - - contents.extend( - self._format_mm_content( - query.get("text"), - query.get("image"), - query.get("video"), - prefix=":", - ) - ) - contents.extend( - self._format_mm_content( - document.get("text"), - document.get("image"), - document.get("video"), - prefix="\n:", - ) - ) - - return [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": ( - "Judge whether the Document meets the requirements based on the Query and the Instruct " - 'provided. Note that the answer can only be "yes" or "no".' - ), - } - ], - }, - {"role": "user", "content": contents}, - ] - - def _tokenize_pair(self, pair: List[Dict]) -> Dict: - """Tokenize a query-document pair with the exact HF multimodal pipeline.""" - # Processor expects list-of-conversations. - pairs = [pair] - text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) - - # Build image/video tensors + metadata for processor inputs. 
- images, videos, video_kwargs = process_vision_info( - pairs, - image_patch_size=16, - return_video_kwargs=True, - return_video_metadata=True, - ) - - if videos is not None: - videos, video_metadatas = zip(*videos) - videos = list(videos) - video_metadatas = list(video_metadatas) - else: - video_metadatas = None - - inputs = self.processor( - text=text, - images=images, - videos=videos, - video_metadata=video_metadatas, - truncation=False, - padding=False, - do_resize=False, - **video_kwargs, - ) - - # Apply custom truncation preserving trailing template control tokens. - for i, input_ids in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ( - self._truncate_tokens_optimized( - input_ids[:-5], - self.max_length, - self.processor.tokenizer.all_special_ids, - ) - + input_ids[-5:] - ) - - # Re-pad through tokenizer utilities so masks align with token ids. - padded = self.processor.tokenizer.pad( - {"input_ids": inputs["input_ids"]}, - padding=True, - return_tensors="pt", - max_length=self.max_length, - ) - for key in padded: - inputs[key] = padded[key] - - if "pixel_values" in inputs: - # Keep pixels fp32 before explicit cast to fp16 during vision run. - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - return inputs - - def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int = None): - """Prepare model inputs for dual-QPC prefill execution.""" - # True prompt length before compile-aligned padding. - runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1]) - effective_prefill = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len - if effective_prefill < runtime_prompt_len: - raise ValueError( - f"prefill_seq_len ({effective_prefill}) must be >= runtime prompt length ({runtime_prompt_len})." - ) - - # Let model helper compute position_ids and multimodal placement. 
- prepared_inputs = self.model.model.prepare_inputs_for_generation( - inputs=tokenized_inputs, - prefill_seq_len=effective_prefill, - batch_size=1, - ) - - # Normalize image_grid_thw to the shape consumed by compiled path. - if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2: - thw = prepared_inputs["image_grid_thw"][0] - t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item()) - prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype) - - if "pixel_values" in prepared_inputs: - prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32) - - return prepared_inputs, runtime_prompt_len - - def _ensure_compiled(self, prefill_seq_len: int, height: int, width: int): - """Compile QPCs if needed, otherwise reuse cached compiled artifacts.""" - # Reuse previously compiled artifacts whenever shapes are compatible. - if ( - self._compiled_qpc_paths is not None - and prefill_seq_len <= self._compiled_prefill_seq_len - and height == self._compiled_height - and width == self._compiled_width - ): - return - - reuse_vision_qpc = ( - self._compiled_qpc_paths is not None and height == self._compiled_height and width == self._compiled_width - ) - - # Compile one max prefill specialization and optionally skip vision recompile. 
- compiled_paths = self.model.compile( - prefill_seq_len=prefill_seq_len, - ctx_len=self.ctx_len, - img_size=max(height, width), - height=height, - width=width, - num_cores=self.num_cores, - num_devices=self.num_devices, - mxfp6_matmul=self.mxfp6_matmul, - # vision_embed_fp32=True, - skip_vision=reuse_vision_qpc, - ) - if reuse_vision_qpc: - compiled_paths["vision_qpc_path"] = self._compiled_qpc_paths["vision_qpc_path"] - - self._compiled_qpc_paths = compiled_paths - self._compiled_prefill_seq_len = prefill_seq_len - self._compiled_height = height - self._compiled_width = width - - @staticmethod - def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: - """Create zero-valued placeholders matching vision output buffers.""" - return {name: np.zeros_like(value) for name, value in vision_outputs.items()} - - def _run_ai100_vision(self, prepared_inputs) -> Dict[str, np.ndarray]: - """Run the compiled vision encoder QPC and return retained-state buffers.""" - if "pixel_values" not in prepared_inputs or "image_grid_thw" not in prepared_inputs: - raise ValueError("Missing pixel_values/image_grid_thw for vision execution.") - - # Vision session produces retained states consumed by language session. - vision_session = QAICInferenceSession(self._compiled_qpc_paths["vision_qpc_path"]) - vision_outputs = vision_session.run( - { - # Vision qpc expects fp16 pixels + int64 grid coordinates. - "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16), - "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64), - } - ) - vision_session.deactivate() - return vision_outputs - - def _run_ai100_prefill(self, prepared_inputs, vision_template: Dict[str, np.ndarray]) -> np.ndarray: - """Run one prefill pass on AI100 language QPC and return logits.""" - # Match runtime input to compiled prefill length. 
- prefill_len = prepared_inputs["position_ids"].shape[-1] - input_ids = prepared_inputs["input_ids"] - if input_ids.shape[1] < prefill_len: - pad = torch.full( - (input_ids.shape[0], prefill_len - input_ids.shape[1]), - 1, - dtype=input_ids.dtype, - device=input_ids.device, - ) - input_ids = torch.cat([input_ids, pad], dim=1) - else: - input_ids = input_ids[:, :prefill_len] - - position_ids = prepared_inputs["position_ids"][..., :prefill_len] - - # For text-only docs, inject zeroed retained states with matching shapes. - if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_outputs = self._run_ai100_vision(prepared_inputs) - else: - vision_outputs = self._zero_vision_outputs(vision_template) - - # Skip past/retained buffers and run only required prefill inputs. - lang_session = QAICInferenceSession(self._compiled_qpc_paths["lang_qpc_path"]) - lang_session.skip_buffers( - [ - name - for name in lang_session.input_names + lang_session.output_names - if name.startswith("past_") or name.endswith("_RetainedState") - ] - ) - lang_session.set_buffers(vision_outputs) - outputs = lang_session.run( - { - # image_idx selects the vision buffer slot for this request. - "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), - "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), - "image_idx": np.zeros((1, 1), dtype=np.int64), - } - ) - lang_session.deactivate() - return outputs["logits"] - - def process(self, inputs: Dict) -> List[float]: - """Score all documents for one query on AI100. - - High-level flow: - 1) Build model-ready query-document pairs. - 2) Find max prompt/image shape across all docs. - 3) Compile once at max shape (single stable specialization). - 4) Run prefill per doc and convert logits -> score. - """ - # Unpack user payload. 
- instruction = inputs["instruction"] - query = inputs.get("query", {}) - documents = inputs.get("documents", []) - - # Collect per-document tokenized contexts first so we can compile once - # with the largest prompt/image shape required by this request. - prepared_contexts = [] - max_prompt_len = 0 - max_grid_h = 22 - max_grid_w = 34 - - # Build each pair in the exact chat-template format expected by the model. - for document in documents: - pair = self._format_mm_instruction(instruction, query, document) - tokenized = self._tokenize_pair(pair) - runtime_prompt_len = int(tokenized["input_ids"].shape[1]) - - # Track the max image grid (H, W) seen so compile dimensions can - # handle all documents in this batch. - if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: - grid = tokenized["image_grid_thw"] - max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) - max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) - - prepared_contexts.append( - { - "tokenized": tokenized, - "runtime_prompt_len": runtime_prompt_len, - } - ) - max_prompt_len = max(max_prompt_len, runtime_prompt_len) - - # Empty documents list => no scores. - if max_prompt_len == 0: - return [] - - # Convert max grid to compile-time pixel dimensions using model patch size. - patch_size = int(self.model.model.config.vision_config.patch_size) - compile_height = max_grid_h * patch_size - compile_width = max_grid_w * patch_size - - # Compile/reuse a single language specialization and prepare all requests - # to that same prefill length to avoid per-document recompiles. - target_prefill_seq_len = max_prompt_len - if self.compile_prefill_seq_len is not None: - if self.compile_prefill_seq_len < max_prompt_len: - raise ValueError( - f"--compile-prefill-seq-len ({self.compile_prefill_seq_len}) must be >= " - f"max runtime prompt length ({max_prompt_len})." 
- ) - target_prefill_seq_len = self.compile_prefill_seq_len - - self._ensure_compiled(target_prefill_seq_len, compile_height, compile_width) - - # Prepare all documents to the same prefill length used at compile time. - prepared_contexts_with_prefill = [] - vision_template = None - for ctx in prepared_contexts: - prepared_inputs, _ = self._prepare_inputs(ctx["tokenized"], prefill_seq_len=target_prefill_seq_len) - prepared_contexts_with_prefill.append({"prepared_inputs": prepared_inputs}) - - # Capture one real vision-output template so text-only docs can reuse - # zero-valued buffers with exact matching shapes. - if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_template = self._run_ai100_vision(prepared_inputs) - - # This example currently expects at least one image document to establish - # retained-state buffer shapes for mixed image/text batches. - if vision_template is None: - raise ValueError("At least one image document is required to initialize AI100 vision buffers.") - - # Run language prefill and compute scalar score per document. - scores = [] - for ctx in prepared_contexts_with_prefill: - logits = self._run_ai100_prefill( - ctx["prepared_inputs"], - vision_template=vision_template, - ) - # Reranker score = sigmoid(logit_yes - logit_no). - score = self._score_from_logits(logits, self.yes_token_id, self.no_token_id) - scores.append(score) - - return scores - - -def main(): - # Keep CLI simple: just allow model id/path override. 
- parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.") - parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME) - parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.") - parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.") - parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.") - parser.add_argument( - "--mxfp6-matmul", - action="store_true", - help="Enable MXFP6 matmul during compile (default: disabled).", - ) - parser.add_argument( - "--compile-prefill-seq-len", - type=int, - default=None, - help=( - "Optional fixed prefill sequence length for compile/padding. " - "Must be >= max prompt length of the current request." - ), - ) - args = parser.parse_args() - - model = QEffQwen3VLReranker( - model_name_or_path=args.model_name, - ctx_len=args.ctx_len, - num_cores=args.num_cores, - num_devices=args.num_devices, - mxfp6_matmul=args.mxfp6_matmul, - compile_prefill_seq_len=args.compile_prefill_seq_len, - ) - - # Example input payload matching the HF reranker schema. - inputs = { - "instruction": "Retrieve images or text relevant to the user's query.", - "query": {"text": "A woman playing with her dog on a beach at sunset."}, - "documents": [ - { - "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust." 
- }, - {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, - { - "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", - }, - ], - "fps": 1.0, - } - - # Print one score per document in the same order as inputs["documents"]. - scores = model.process(inputs) - print(scores) - - -if __name__ == "__main__": - main() diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index f4cdb6a0fd..85df559970 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -5,7 +5,7 @@ "model_type": "llava", "batch_size": 1, "prompt_len": 784, - "ctx_len": 2048, + "ctx_len": 1024, "img_size": 336, "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", diff --git a/tests/transformers/models/image_text_to_text/test_reranker_mad.py b/tests/transformers/models/image_text_to_text/test_reranker_mad.py deleted file mode 100644 index 3a6497b520..0000000000 --- a/tests/transformers/models/image_text_to_text/test_reranker_mad.py +++ /dev/null @@ -1,455 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -import json -import os -from typing import Dict, List, Tuple - -import numpy as np -import pytest -import torch -from huggingface_hub import snapshot_download -from qwen_vl_utils import process_vision_info -from transformers import AutoConfig, AutoProcessor - -from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText -from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm - -CONFIG_PATH = "tests/configs/image_text_model_configs.json" - -PT_AI100_MAD_MAX = 5e-3 -MAX_LENGTH = 8192 -RERANKER_DOC_LIMIT = int(os.getenv("QEFF_RERANKER_DOC_LIMIT", "0")) - -IMAGE_BASE_FACTOR = 16 -IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2 -MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR -MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR - -EXAMPLE_INPUTS = { - "instruction": "Retrieve relevant content.", - "query": {"text": "dog on beach"}, - "documents": [ - {"image": "https://picsum.photos/id/237/536/354"}, - {"text": "A dog running on the beach."}, - ], -} - -with open(CONFIG_PATH, "r") as f: - config_data = json.load(f) - reranker_models = config_data["image_text_reranker_models"] - -test_reranker_models = [model_config["model_name"] for model_config in reranker_models] -reranker_model_config_dict = {model["model_name"]: model for model in reranker_models} - - -def _resolve_model_source(model_name_or_path: str) -> str: - if os.path.isdir(model_name_or_path): - return model_name_or_path - return snapshot_download(repo_id=model_name_or_path) - - -def _format_mm_content(text, image, video, prefix: str) -> List[Dict]: - content = [{"type": "text", "text": prefix}] - - if not text and not image and not video: - content.append({"type": "text", "text": "NULL"}) - return content - - if video: - raise ValueError("Video input is not supported in this test.") - - 
if image: - if isinstance(image, str): - image_content = image if image.startswith(("http", "oss")) else "file://" + image - else: - image_content = image - content.append( - { - "type": "image", - "image": image_content, - "min_pixels": MIN_PIXELS, - "max_pixels": MAX_PIXELS, - } - ) - - if text: - content.append({"type": "text", "text": text}) - - return content - - -def _format_mm_instruction(instruction: str, query: Dict, document: Dict) -> List[Dict]: - contents = [{"type": "text", "text": ": " + instruction}] - - contents.extend( - _format_mm_content( - query.get("text"), - query.get("image"), - query.get("video"), - prefix=":", - ) - ) - contents.extend( - _format_mm_content( - document.get("text"), - document.get("image"), - document.get("video"), - prefix="\n:", - ) - ) - - return [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": ( - "Judge whether the Document meets the requirements based on the Query and the Instruct " - 'provided. Note that the answer can only be "yes" or "no".' 
- ), - } - ], - }, - {"role": "user", "content": contents}, - ] - - -def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: - if len(tokens) <= max_length: - return tokens - - special_tokens_set = set(special_tokens) - num_special = sum(1 for token in tokens if token in special_tokens_set) - num_non_special_to_keep = max_length - num_special - - final_tokens = [] - non_special_kept_count = 0 - for token in tokens: - if token in special_tokens_set: - final_tokens.append(token) - elif non_special_kept_count < num_non_special_to_keep: - final_tokens.append(token) - non_special_kept_count += 1 - return final_tokens - - -def _tokenize_pair(processor, pair: List[Dict]) -> Dict: - pairs = [pair] - text = processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) - - images, videos, video_kwargs = process_vision_info( - pairs, - image_patch_size=16, - return_video_kwargs=True, - return_video_metadata=True, - ) - - if videos is not None: - videos, video_metadatas = zip(*videos) - videos = list(videos) - video_metadatas = list(video_metadatas) - else: - video_metadatas = None - - inputs = processor( - text=text, - images=images, - videos=videos, - video_metadata=video_metadatas, - truncation=False, - padding=False, - do_resize=False, - **video_kwargs, - ) - - for i, input_ids in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ( - _truncate_tokens_optimized( - input_ids[:-5], - MAX_LENGTH, - processor.tokenizer.all_special_ids, - ) - + input_ids[-5:] - ) - - padded = processor.tokenizer.pad( - {"input_ids": inputs["input_ids"]}, - padding=True, - return_tensors="pt", - max_length=MAX_LENGTH, - ) - for key in padded: - inputs[key] = padded[key] - - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - return inputs - - -def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]: - vocab = tokenizer.get_vocab() - if "yes" not in vocab or "no" not in 
vocab: - raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.") - return vocab["yes"], vocab["no"] - - -def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> np.ndarray: - if isinstance(logits, np.ndarray): - logits_tensor = torch.from_numpy(logits) - else: - logits_tensor = logits.detach().cpu() - - if logits_tensor.ndim == 3: - logits_tensor = logits_tensor[:, -1, :] - elif logits_tensor.ndim != 2: - raise ValueError(f"Unsupported logits rank for score conversion: {logits_tensor.ndim}") - - score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) - return score.detach().cpu().numpy().astype(np.float64) - - -def _score_from_last_hidden(last_hidden_state: torch.Tensor, score_linear: torch.nn.Linear) -> np.ndarray: - score = torch.sigmoid(score_linear(last_hidden_state[:, -1])).squeeze(-1) - return score.detach().cpu().numpy().astype(np.float64) - - -def _make_score_linear(model_hf, yes_token_id: int, no_token_id: int) -> torch.nn.Linear: - lm_head_weights = model_hf.lm_head.weight.data - weight_yes = lm_head_weights[yes_token_id] - weight_no = lm_head_weights[no_token_id] - - linear_layer = torch.nn.Linear(weight_yes.shape[0], 1, bias=False) - with torch.no_grad(): - linear_layer.weight[0] = weight_yes - weight_no - return linear_layer.eval() - - -def _mad_stats(reference: np.ndarray, candidate: np.ndarray) -> Tuple[float, float]: - diff = np.abs(reference - candidate) - return float(np.mean(diff)), float(np.max(diff)) - - -def _prepare_qeff_inputs(qeff_model, tokenized_inputs: Dict, prefill_seq_len: int = None): - runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1]) - effective_prefill_seq_len = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len - if effective_prefill_seq_len < runtime_prompt_len: - raise ValueError( - f"prefill_seq_len ({effective_prefill_seq_len}) must be >= runtime prompt length ({runtime_prompt_len})." 
- ) - - prepared_inputs = qeff_model.model.prepare_inputs_for_generation( - inputs=tokenized_inputs, - prefill_seq_len=effective_prefill_seq_len, - batch_size=1, - ) - - if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2: - thw = prepared_inputs["image_grid_thw"][0] - t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item()) - prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype) - - if "pixel_values" in prepared_inputs: - prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32) - - return prepared_inputs, runtime_prompt_len - - -def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: - return {name: np.zeros_like(value) for name, value in vision_outputs.items()} - - -def _run_ai100_vision(vision_qpc_path: str, prepared_inputs) -> Dict[str, np.ndarray]: - vision_session = QAICInferenceSession(vision_qpc_path) - vision_inputs = { - "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16), - "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64), - } - vision_outputs = vision_session.run(vision_inputs) - vision_session.deactivate() - return vision_outputs - - -def _run_ai100_prefill(qpc_paths, prepared_inputs, vision_template): - if not isinstance(qpc_paths, dict): - raise ValueError("Expected qpc_paths to be a dict with vision/lang QPC keys.") - - vision_qpc_path = qpc_paths.get("vision_qpc_path") - lang_qpc_path = qpc_paths.get("lang_qpc_path") - if vision_qpc_path is None or lang_qpc_path is None: - raise ValueError("Missing vision_qpc_path/lang_qpc_path in compiled QPC outputs.") - - prefill_len = prepared_inputs["position_ids"].shape[-1] - input_ids = prepared_inputs["input_ids"] - if input_ids.shape[1] < prefill_len: - pad = torch.full( - (input_ids.shape[0], prefill_len - input_ids.shape[1]), - 1, - dtype=input_ids.dtype, - device=input_ids.device, - ) - 
input_ids = torch.cat([input_ids, pad], dim=1) - else: - input_ids = input_ids[:, :prefill_len] - position_ids = prepared_inputs["position_ids"][..., :prefill_len] - - if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_outputs = _run_ai100_vision(vision_qpc_path, prepared_inputs) - else: - vision_outputs = _zero_vision_outputs(vision_template) - - lang_session = QAICInferenceSession(lang_qpc_path) - lang_session.skip_buffers( - [ - name - for name in lang_session.input_names + lang_session.output_names - if name.startswith("past_") or name.endswith("_RetainedState") - ] - ) - lang_session.set_buffers(vision_outputs) - lang_inputs = { - "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), - "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), - "image_idx": np.zeros((1, 1), dtype=np.int64), - } - outputs = lang_session.run(lang_inputs) - lang_session.deactivate() - return outputs["logits"] - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.regular -@pytest.mark.parametrize("model_name", test_reranker_models) -def test_qwen3_vl_reranker_mad_parity(model_name): - torch.manual_seed(42) - model_cfg = reranker_model_config_dict[model_name] - model_source = _resolve_model_source(model_name) - - config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) - config = set_num_layers_vlm(config, n_layer=model_cfg["num_layers"]) - if hasattr(config, "use_cache"): - config.use_cache = True - if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): - config.text_config.use_cache = True - - model_hf = load_vlm_model(config) - model_hf.eval() - - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_source, - kv_offload=True, - config=config, - ) - processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) - - yes_token_id, no_token_id = _get_yes_no_token_ids(processor.tokenizer) - score_linear = 
_make_score_linear(model_hf, yes_token_id, no_token_id).to(next(model_hf.parameters()).device) - score_linear = score_linear.to(dtype=next(model_hf.parameters()).dtype) - - doc_contexts = [] - max_prompt_len = 0 - max_grid_h = 22 - max_grid_w = 34 - - hf_scores_list = [] - - documents = EXAMPLE_INPUTS["documents"] - if RERANKER_DOC_LIMIT > 0: - documents = documents[:RERANKER_DOC_LIMIT] - - for document in documents: - pair = _format_mm_instruction( - instruction=EXAMPLE_INPUTS["instruction"], - query=EXAMPLE_INPUTS["query"], - document=document, - ) - tokenized = _tokenize_pair(processor, pair) - runtime_prompt_len = int(tokenized["input_ids"].shape[1]) - - hf_inputs = {} - for key, value in tokenized.items(): - hf_inputs[key] = value.to(next(model_hf.parameters()).device) if torch.is_tensor(value) else value - with torch.no_grad(): - hf_last_hidden = model_hf.model(**hf_inputs).last_hidden_state - hf_score = _score_from_last_hidden(hf_last_hidden, score_linear)[0] - hf_scores_list.append(float(hf_score)) - - if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: - grid = tokenized["image_grid_thw"] - max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) - max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) - - doc_contexts.append( - { - "tokenized": tokenized, - } - ) - max_prompt_len = max(max_prompt_len, runtime_prompt_len) - - patch_size = int(qeff_model.model.config.vision_config.patch_size) - compile_height = max_grid_h * patch_size - compile_width = max_grid_w * patch_size - - qpc_paths = qeff_model.compile( - img_size=max(compile_height, compile_width), - height=compile_height, - width=compile_width, - prefill_seq_len=max_prompt_len, - ctx_len=model_cfg["ctx_len"], - num_devices=1, - num_cores=16, - mxfp6_matmul=False, - ) - - ai100_scores_list = [] - - prepared_contexts = [] - vision_template_ai100 = None - for context in doc_contexts: - prepared_inputs, _ = _prepare_qeff_inputs( - qeff_model=qeff_model, - 
tokenized_inputs=context["tokenized"], - prefill_seq_len=max_prompt_len, - ) - prepared_contexts.append( - { - "prepared_inputs": prepared_inputs, - } - ) - if vision_template_ai100 is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_template_ai100 = _run_ai100_vision(qpc_paths["vision_qpc_path"], prepared_inputs) - - if vision_template_ai100 is None: - raise ValueError("Expected at least one image document to initialize vision templates.") - - for context in prepared_contexts: - prepared_inputs_runtime = context["prepared_inputs"] - ai100_logits = _run_ai100_prefill( - qpc_paths=qpc_paths, - prepared_inputs=prepared_inputs_runtime, - vision_template=vision_template_ai100, - ) - ai100_score = _score_from_logits(ai100_logits, yes_token_id, no_token_id)[0] - ai100_scores_list.append(float(ai100_score)) - - hf_scores = np.array(hf_scores_list, dtype=np.float64) - ai100_scores = np.array(ai100_scores_list, dtype=np.float64) - - print(f"[SCORES] PyTorch(original): {hf_scores.tolist()}") - print(f"[SCORES] AI100: {ai100_scores.tolist()}") - - pt_ai100_mad_mean, pt_ai100_mad_max = _mad_stats(hf_scores, ai100_scores) - print(f"[MAD] PyTorch(original) vs AI100: mean={pt_ai100_mad_mean:.6e}, max={pt_ai100_mad_max:.6e}") - assert pt_ai100_mad_max <= PT_AI100_MAD_MAX, ( - f"PyTorch(original) vs AI100 MAD max {pt_ai100_mad_max:.6e} " - f"exceeds threshold {PT_AI100_MAD_MAX:.6e}. " - f"Check tokenizer ids, prompt formatting, runtime prompt length slicing, and compile dimensions." 
- ) From 612ed3e0c3aad26d60d58ae0eea1386816ffa5c0 Mon Sep 17 00:00:00 2001 From: Amit Date: Wed, 20 May 2026 23:32:03 +0530 Subject: [PATCH 03/11] Addressed comments and fix CI issue Signed-off-by: Amit Signed-off-by: Amit Raj --- examples/reranker/qwen3vl/README.md | 7 ++----- examples/reranker/qwen3vl/reranker_model.py | 10 +++------- scripts/Jenkinsfile | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md index d9d96645a8..7ebe1d7db8 100644 --- a/examples/reranker/qwen3vl/README.md +++ b/examples/reranker/qwen3vl/README.md @@ -23,11 +23,8 @@ pip install "qwen-vl-utils>=0.0.14" ## Scripts -- `qwen3_vl_reranker.py` - runnable example that explicitly shows: - - `QEFFAutoModelForImageTextToText.from_pretrained(...)` - - `model.compile(...)` arguments for QPC generation - - AI100 scoring call flow -- `reranker_model.py` - Qwen3-VL-specific helper logic (prompting/tokenization/scoring/runtime glue) adapted from the official Qwen reranker reference: +- `qwen3_vl_reranker.py` - simple runnable API usage example. +- `reranker_model.py` - AI100 dual-QPC implementation adapted from the official Qwen reranker reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py ## Run diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 33e73b05f6..8577c8a979 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -5,17 +5,13 @@ # # ---------------------------------------------------------------------------- -"""Qwen3-VL-specific reranker helpers for AI100 runtime. +"""Core AI100 reranker implementation for Qwen3-VL reranker models. 
The tokenization/scoring flow is adapted from the official Qwen reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py -This module intentionally keeps only Qwen3-VL-specific reranker logic -(prompt construction, multimodal tokenization, yes/no score computation, -and AI100 runtime orchestration with compiled QPC paths). - -Model loading (`from_pretrained`) and model compilation (`compile`) are exposed -in `qwen3_vl_reranker.py` so users can directly see QEff API usage. +This module isolates AI100 dual-QPC runtime details so the user-facing example +script (`qwen3_vl_reranker.py`) remains focused on simple API usage. """ from typing import Dict, List, Tuple diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 49f637c2f9..0858a08254 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -64,7 +64,7 @@ pipeline { pip install .[test] && pip install junitparser pytest-xdist && pip install librosa==0.10.2 soundfile==0.13.1 && - pip install qwen-vl-utils==0.0.14 && + pip install "qwen-vl-utils>=0.0.14" && pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 rm -rf QEfficient" ''' From c4334c18e92f205355c4f9a40256f8d1c32680ec Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 21 May 2026 11:00:53 +0530 Subject: [PATCH 04/11] Updated installation of qwen-vl-utils Signed-off-by: Amit Raj --- QEfficient/transformers/models/modeling_auto.py | 8 ++------ .../transformers/models/whisper/modeling_whisper.py | 5 ++++- scripts/Jenkinsfile | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index fc10032df6..0b1e3702b6 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1377,7 +1377,7 @@ def export( kv_offload=True, continuous_batching=self.continuous_batching, 
comp_ctx_lengths=self.comp_ctx_lengths_decode, - prefill_seq_len=prefill_seq_len, + **dummy_inputs_kwargs, ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, @@ -1385,11 +1385,7 @@ def export( comp_ctx_lengths=self.comp_ctx_lengths_decode, ) except TypeError: - inputs = self.model.get_dummy_inputs( - kv_offload=True, - comp_ctx_lengths=self.comp_ctx_lengths_decode, - prefill_seq_len=prefill_seq_len, - ) + inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode ) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 1bdcd07ada..bf01a1779f 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -795,7 +795,10 @@ def get_dummy_inputs( **kwargs, ): bs = 1 - seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN)) + seq_len = kwargs.get("prefill_seq_len") + if seq_len is None: + seq_len = 32 + seq_len = int(seq_len) encoder_seq_len = self.config.max_source_positions encoder_feature_count = self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 0858a08254..49f637c2f9 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -64,7 +64,7 @@ pipeline { pip install .[test] && pip install junitparser pytest-xdist && pip install librosa==0.10.2 soundfile==0.13.1 && - pip install "qwen-vl-utils>=0.0.14" && + pip install qwen-vl-utils==0.0.14 && pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 rm -rf QEfficient" ''' From eee709853eae6a432286534292504c5c060d42a1 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 22 May 2026 17:58:35 +0530 Subject: [PATCH 05/11] 
Addressed comments Signed-off-by: Amit Raj --- examples/reranker/qwen3vl/README.md | 7 +- .../reranker/qwen3vl/qwen3_vl_reranker.py | 4 +- examples/reranker/qwen3vl/reranker_model.py | 153 ++++++++++++++---- 3 files changed, 126 insertions(+), 38 deletions(-) diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md index 7ebe1d7db8..d9d96645a8 100644 --- a/examples/reranker/qwen3vl/README.md +++ b/examples/reranker/qwen3vl/README.md @@ -23,8 +23,11 @@ pip install "qwen-vl-utils>=0.0.14" ## Scripts -- `qwen3_vl_reranker.py` - simple runnable API usage example. -- `reranker_model.py` - AI100 dual-QPC implementation adapted from the official Qwen reranker reference: +- `qwen3_vl_reranker.py` - runnable example that explicitly shows: + - `QEFFAutoModelForImageTextToText.from_pretrained(...)` + - `model.compile(...)` arguments for QPC generation + - AI100 scoring call flow +- `reranker_model.py` - Qwen3-VL-specific helper logic (prompting/tokenization/scoring/runtime glue) adapted from the official Qwen reranker reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py ## Run diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 01884d0d08..42e2cf5082 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -85,13 +85,13 @@ def main() -> None: model_source = resolve_model_source(args.model_name) # 1) Load config + processor + QEff model through public QEff/HF APIs. 
- config = AutoConfig.from_pretrained(model_source, trust_remote_code=True) + config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) if hasattr(config, "use_cache"): config.use_cache = True if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): config.text_config.use_cache = True - processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True) + processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) model = QEFFAutoModelForImageTextToText.from_pretrained( model_source, kv_offload=True, diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 8577c8a979..8cd8a5ed4f 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -5,32 +5,27 @@ # # ---------------------------------------------------------------------------- -"""Core AI100 reranker implementation for Qwen3-VL reranker models. +"""Qwen3-VL-specific reranker helpers for AI100 runtime. The tokenization/scoring flow is adapted from the official Qwen reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py -This module isolates AI100 dual-QPC runtime details so the user-facing example -script (`qwen3_vl_reranker.py`) remains focused on simple API usage. +This module intentionally keeps only Qwen3-VL-specific reranker logic +(prompt construction, multimodal tokenization, yes/no score computation, +and AI100 runtime orchestration with compiled QPC paths). + +Model loading (`from_pretrained`) and model compilation (`compile`) are exposed +in `qwen3_vl_reranker.py` so users can directly see QEff API usage. 
""" from typing import Dict, List, Tuple import numpy as np import torch +from huggingface_hub import snapshot_download +from qwen_vl_utils import process_vision_info from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( - format_mm_content, - format_mm_instruction, - get_yes_no_token_ids, - score_from_logits, - tokenize_pair, - truncate_tokens_optimized, -) -from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( - resolve_model_source as _resolve_model_source, -) # Max token budget used by this example's manual truncation/padding flow. MAX_LENGTH = 8192 @@ -48,7 +43,9 @@ def resolve_model_source(model_name_or_path: str) -> str: Some transformers versions can fail when resolving chat templates from repo-id mode for this model. Using a local snapshot path avoids that path. """ - return _resolve_model_source(model_name_or_path) + if os.path.isdir(model_name_or_path): + return model_name_or_path + return snapshot_download(repo_id=model_name_or_path) class QEffQwen3VLReranker: @@ -84,40 +81,128 @@ def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: Score formula: sigmoid(logit_yes - logit_no) """ - score = score_from_logits(logits, yes_token_id, no_token_id) + logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() + if logits_tensor.ndim == 3: + logits_tensor = logits_tensor[:, -1, :] + score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) return float(score[0].item()) @staticmethod def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: """Truncate while preserving all special tokens in sequence order.""" - return truncate_tokens_optimized(tokens, max_length, special_tokens) + if len(tokens) <= max_length: + return tokens + + special_tokens_set = set(special_tokens) + num_special = sum(1 for token in tokens if token 
in special_tokens_set) + num_non_special_to_keep = max_length - num_special + + final_tokens = [] + non_special_kept_count = 0 + for token in tokens: + if token in special_tokens_set: + final_tokens.append(token) + elif non_special_kept_count < num_non_special_to_keep: + final_tokens.append(token) + non_special_kept_count += 1 + return final_tokens def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: """Build one multimodal content block (prefix + optional image + optional text).""" - return format_mm_content( - text=text, - image=image, - video=video, - prefix=prefix, - min_pixels=MIN_PIXELS, - max_pixels=MAX_PIXELS, - unsupported_video_error="Video input is not supported in this AI100-only example.", - ) + content = [{"type": "text", "text": prefix}] + + if not text and not image and not video: + content.append({"type": "text", "text": "NULL"}) + return content + + if video: + raise ValueError("Video input is not supported in this AI100-only example.") + + if image: + if isinstance(image, str): + image_content = image if image.startswith(("http", "oss")) else "file://" + image + else: + image_content = image + content.append( + { + "type": "image", + "image": image_content, + "min_pixels": MIN_PIXELS, + "max_pixels": MAX_PIXELS, + } + ) + + if text: + content.append({"type": "text", "text": text}) + + return content def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: """Create the chat payload for one query-document pair.""" - return format_mm_instruction( - instruction=instruction, - query=query, - document=document, - min_pixels=MIN_PIXELS, - max_pixels=MAX_PIXELS, - unsupported_video_error="Video input is not supported in this AI100-only example.", + contents = [{"type": "text", "text": ": " + instruction}] + + contents.extend( + self._format_mm_content( + query.get("text"), + query.get("image"), + query.get("video"), + prefix=":", + ) ) def _tokenize_pair(self, pair: List[Dict]) -> Dict: 
"""Tokenize a query-document pair with the exact HF multimodal pipeline.""" - return tokenize_pair(self.processor, pair, self.max_length) + pairs = [pair] + text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) + + images, videos, video_kwargs = process_vision_info( + pairs, + image_patch_size=16, + return_video_kwargs=True, + return_video_metadata=True, + ) + + if videos is not None: + videos, video_metadatas = zip(*videos) + videos = list(videos) + video_metadatas = list(video_metadatas) + else: + video_metadatas = None + + inputs = self.processor( + text=text, + images=images, + videos=videos, + video_metadata=video_metadatas, + truncation=False, + padding=False, + do_resize=False, + **video_kwargs, + ) + + for i, input_ids in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = ( + self._truncate_tokens_optimized( + input_ids[:-5], + self.max_length, + self.processor.tokenizer.all_special_ids, + ) + + input_ids[-5:] + ) + + padded = self.processor.tokenizer.pad( + {"input_ids": inputs["input_ids"]}, + padding=True, + return_tensors="pt", + max_length=self.max_length, + ) + for key in padded: + inputs[key] = padded[key] + + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + return inputs def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int): """Prepare model inputs for dual-QPC prefill execution.""" From 7d1e2f43f7257d3174f2d66d4f5ee560de13a3da Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Mon, 1 Jun 2026 14:25:26 +0530 Subject: [PATCH 06/11] Rebased and addressed comments Signed-off-by: Amit Raj --- .../models/whisper/modeling_whisper.py | 5 +- .../reranker/qwen3vl/qwen3_vl_reranker.py | 4 +- examples/reranker/qwen3vl/reranker_model.py | 143 ++++-------------- 3 files changed, 34 insertions(+), 118 deletions(-) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 
bf01a1779f..1bdcd07ada 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -795,10 +795,7 @@ def get_dummy_inputs( **kwargs, ): bs = 1 - seq_len = kwargs.get("prefill_seq_len") - if seq_len is None: - seq_len = 32 - seq_len = int(seq_len) + seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN)) encoder_seq_len = self.config.max_source_positions encoder_feature_count = self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 42e2cf5082..01884d0d08 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -85,13 +85,13 @@ def main() -> None: model_source = resolve_model_source(args.model_name) # 1) Load config + processor + QEff model through public QEff/HF APIs. - config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) + config = AutoConfig.from_pretrained(model_source, trust_remote_code=True) if hasattr(config, "use_cache"): config.use_cache = True if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): config.text_config.use_cache = True - processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) + processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True) model = QEFFAutoModelForImageTextToText.from_pretrained( model_source, kv_offload=True, diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 8cd8a5ed4f..33e73b05f6 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -22,10 +22,19 @@ import numpy as np import torch -from huggingface_hub import snapshot_download -from qwen_vl_utils import process_vision_info from 
QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( + format_mm_content, + format_mm_instruction, + get_yes_no_token_ids, + score_from_logits, + tokenize_pair, + truncate_tokens_optimized, +) +from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( + resolve_model_source as _resolve_model_source, +) # Max token budget used by this example's manual truncation/padding flow. MAX_LENGTH = 8192 @@ -43,9 +52,7 @@ def resolve_model_source(model_name_or_path: str) -> str: Some transformers versions can fail when resolving chat templates from repo-id mode for this model. Using a local snapshot path avoids that path. """ - if os.path.isdir(model_name_or_path): - return model_name_or_path - return snapshot_download(repo_id=model_name_or_path) + return _resolve_model_source(model_name_or_path) class QEffQwen3VLReranker: @@ -81,128 +88,40 @@ def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: Score formula: sigmoid(logit_yes - logit_no) """ - logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() - if logits_tensor.ndim == 3: - logits_tensor = logits_tensor[:, -1, :] - score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) + score = score_from_logits(logits, yes_token_id, no_token_id) return float(score[0].item()) @staticmethod def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: """Truncate while preserving all special tokens in sequence order.""" - if len(tokens) <= max_length: - return tokens - - special_tokens_set = set(special_tokens) - num_special = sum(1 for token in tokens if token in special_tokens_set) - num_non_special_to_keep = max_length - num_special - - final_tokens = [] - non_special_kept_count = 0 - for token in tokens: - if token in special_tokens_set: - final_tokens.append(token) - elif non_special_kept_count 
< num_non_special_to_keep: - final_tokens.append(token) - non_special_kept_count += 1 - return final_tokens + return truncate_tokens_optimized(tokens, max_length, special_tokens) def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: """Build one multimodal content block (prefix + optional image + optional text).""" - content = [{"type": "text", "text": prefix}] - - if not text and not image and not video: - content.append({"type": "text", "text": "NULL"}) - return content - - if video: - raise ValueError("Video input is not supported in this AI100-only example.") - - if image: - if isinstance(image, str): - image_content = image if image.startswith(("http", "oss")) else "file://" + image - else: - image_content = image - content.append( - { - "type": "image", - "image": image_content, - "min_pixels": MIN_PIXELS, - "max_pixels": MAX_PIXELS, - } - ) - - if text: - content.append({"type": "text", "text": text}) - - return content + return format_mm_content( + text=text, + image=image, + video=video, + prefix=prefix, + min_pixels=MIN_PIXELS, + max_pixels=MAX_PIXELS, + unsupported_video_error="Video input is not supported in this AI100-only example.", + ) def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: """Create the chat payload for one query-document pair.""" - contents = [{"type": "text", "text": ": " + instruction}] - - contents.extend( - self._format_mm_content( - query.get("text"), - query.get("image"), - query.get("video"), - prefix=":", - ) + return format_mm_instruction( + instruction=instruction, + query=query, + document=document, + min_pixels=MIN_PIXELS, + max_pixels=MAX_PIXELS, + unsupported_video_error="Video input is not supported in this AI100-only example.", ) def _tokenize_pair(self, pair: List[Dict]) -> Dict: """Tokenize a query-document pair with the exact HF multimodal pipeline.""" - pairs = [pair] - text = self.processor.apply_chat_template(pairs, tokenize=False, 
add_generation_prompt=True) - - images, videos, video_kwargs = process_vision_info( - pairs, - image_patch_size=16, - return_video_kwargs=True, - return_video_metadata=True, - ) - - if videos is not None: - videos, video_metadatas = zip(*videos) - videos = list(videos) - video_metadatas = list(video_metadatas) - else: - video_metadatas = None - - inputs = self.processor( - text=text, - images=images, - videos=videos, - video_metadata=video_metadatas, - truncation=False, - padding=False, - do_resize=False, - **video_kwargs, - ) - - for i, input_ids in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ( - self._truncate_tokens_optimized( - input_ids[:-5], - self.max_length, - self.processor.tokenizer.all_special_ids, - ) - + input_ids[-5:] - ) - - padded = self.processor.tokenizer.pad( - {"input_ids": inputs["input_ids"]}, - padding=True, - return_tensors="pt", - max_length=self.max_length, - ) - for key in padded: - inputs[key] = padded[key] - - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - return inputs + return tokenize_pair(self.processor, pair, self.max_length) def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int): """Prepare model inputs for dual-QPC prefill execution.""" From 15d0ff1817fd50149e4c4817e4b03cfc65f38215 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 01:05:16 +0530 Subject: [PATCH 07/11] Intial fix Signed-off-by: Amit Raj --- QEfficient/transformers/models/modeling_auto.py | 9 ++++----- .../models/qwen3_vl/modeling_qwen3_vl.py | 15 +++++---------- examples/reranker/qwen3vl/README.md | 1 - examples/reranker/qwen3vl/qwen3_vl_reranker.py | 2 -- examples/reranker/qwen3vl/reranker_model.py | 5 +++-- 5 files changed, 12 insertions(+), 20 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 0b1e3702b6..360aaa13ec 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ 
b/QEfficient/transformers/models/modeling_auto.py @@ -1367,17 +1367,12 @@ def export( List[str] A list containing the paths to the generated ONNX graph files for both components. """ - dummy_inputs_kwargs = {} - if prefill_seq_len is not None: - dummy_inputs_kwargs["prefill_seq_len"] = int(prefill_seq_len) - # TODO This is a temporary change as continous batching is enabled only for few models. Once support is added for all the models this exception handing can be removed. try: inputs = self.model.get_dummy_inputs( kv_offload=True, continuous_batching=self.continuous_batching, comp_ctx_lengths=self.comp_ctx_lengths_decode, - **dummy_inputs_kwargs, ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, @@ -1678,6 +1673,10 @@ def compile( elif prefill_seq_len == 1: specializations = specializations["lang"][-1:] qpc_key = "lang_decode_qpc_path" + elif prefill_seq_len is not None and ctx_len is not None and prefill_seq_len == ctx_len: + # Single-shot mode (e.g. reranker): no decode steps, only prefill kernel needed. 
+ specializations = specializations["lang"][:1] + qpc_key = "lang_qpc_path" else: specializations = specializations["lang"] qpc_key = "lang_qpc_path" diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 0f6ab210de..9f609ea2ea 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -847,13 +847,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) - inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) # vision_size = 1024 vision_size = 187 inputs_shapes["vision_embeds"] = ( @@ -865,7 +860,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = (748, 1536) inputs_shapes["image_idx"] = (1, 1) @@ -889,8 +884,8 @@ def get_dummy_inputs( ) lang_inputs["position_ids"] = ( ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -908,7 +903,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in 
range(self.model.config.text_config.num_hidden_layers)] diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md index d9d96645a8..74bc9d4a2a 100644 --- a/examples/reranker/qwen3vl/README.md +++ b/examples/reranker/qwen3vl/README.md @@ -49,7 +49,6 @@ With compile parameters: ```bash python examples/reranker/qwen3vl/qwen3_vl_reranker.py \ --model-name Qwen/Qwen3-VL-Reranker-2B \ - --ctx-len 2048 \ --num-cores 16 \ --num-devices 1 \ --compile-prefill-seq-len 4096 \ diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 01884d0d08..504280e7d9 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -30,7 +30,6 @@ def parse_args() -> argparse.Namespace: """Parse command-line arguments for AI100 compile/inference knobs.""" parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.") parser.add_argument("--model-name", type=str, default="Qwen/Qwen3-VL-Reranker-2B") - parser.add_argument("--ctx-len", type=int, default=2048, help="Context length used at compile time.") parser.add_argument("--num-cores", type=int, default=16, help="Number of AI100 cores.") parser.add_argument("--num-devices", type=int, default=1, help="Number of AI100 devices.") parser.add_argument( @@ -106,7 +105,6 @@ def main() -> None: # 3) Derive compile requirements from current payload. 
compile_specs = reranker.get_compile_specs( inputs=inputs, - ctx_len=args.ctx_len, prefill_seq_len=args.compile_prefill_seq_len, ) diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 33e73b05f6..32c4e65eaa 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -173,7 +173,7 @@ def _collect_contexts(self, inputs: Dict): return prepared_contexts, max_prompt_len, max_grid_h, max_grid_w - def get_compile_specs(self, inputs: Dict, ctx_len: int, prefill_seq_len: int = None) -> Dict[str, int]: + def get_compile_specs(self, inputs: Dict, prefill_seq_len: int = None) -> Dict[str, int]: """Return compile parameters required for this input batch.""" _, max_prompt_len, max_grid_h, max_grid_w = self._collect_contexts(inputs) if max_prompt_len == 0: @@ -189,9 +189,10 @@ def get_compile_specs(self, inputs: Dict, ctx_len: int, prefill_seq_len: int = N height = max_grid_h * patch_size width = max_grid_w * patch_size + # ctx_len == prefill_seq_len always: reranker is single-shot prefill, no decode steps. 
return { "prefill_seq_len": target_prefill_seq_len, - "ctx_len": int(ctx_len), + "ctx_len": target_prefill_seq_len, "img_size": max(height, width), "height": height, "width": width, From 28dc773ede838e8161800a67d2168c5ad4cf9a51 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 14:38:11 +0530 Subject: [PATCH 08/11] Update the exmple script and modelling files Signed-off-by: Amit Raj --- .../models/gemma3/modeling_gemma3.py | 20 ++++--------- .../models/internvl/modeling_internvl.py | 20 ++++--------- .../models/llama4/modeling_llama4.py | 20 ++++--------- .../models/llava/modeling_llava.py | 8 ++--- .../models/llava_next/modeling_llava_next.py | 10 +++---- .../models/mistral3/modeling_mistral3.py | 14 ++++----- .../models/mllama/modeling_mllama.py | 7 ++--- .../models/molmo/modeling_molmo.py | 14 ++++----- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 14 ++++----- .../qwen3_vl_moe/modeling_qwen3_vl_moe.py | 14 ++++----- .../models/whisper/modeling_whisper.py | 5 ++-- tests/configs/image_text_model_configs.json | 30 ------------------- tests/configs/reranker_model_configs.json | 28 +++++++++++++++++ .../models/reranker/test_reranker_mad.py | 7 ++--- .../reranker/test_reranker_models_unit.py | 9 +++--- 15 files changed, 83 insertions(+), 137 deletions(-) create mode 100644 tests/configs/reranker_model_configs.json diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index a3e9257a73..35d9c07cf8 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -969,16 +969,8 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len, dtype=None): return past_key_values def get_dummy_inputs( - self, - comp_ctx_lengths: Optional[List[int]] = None, - kv_offload: bool = False, - continuous_batching: bool = False, - **kwargs, + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, 
continuous_batching: bool = False ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: @@ -987,7 +979,7 @@ def get_dummy_inputs( mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( 1, # constants.INTERN_NUM_PATCHES, mm_tokens_per_image, # constants.INTERN_FEATURE_SIZE, @@ -995,7 +987,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -1012,8 +1004,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -1025,7 +1017,7 @@ def get_dummy_inputs( lang_inputs["past_key_values"] = self.get_dummy_pkv_cache( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) if comp_ctx_lengths is not None: diff --git 
a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 563c42e256..821381ac0d 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -273,16 +273,8 @@ def get_output_names(self, kv_offload: bool = False): return output_names def get_dummy_inputs( - self, - comp_ctx_lengths: Optional[List[int]] = None, - kv_offload: bool = False, - continuous_batching: bool = False, - **kwargs, + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: @@ -301,7 +293,7 @@ def get_dummy_inputs( # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( 1, computed_feature_size * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -309,7 +301,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.INTERN_NUM_PATCHES * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -329,8 +321,8 @@ def get_dummy_inputs( (inputs_shapes["vision_embeds"]), dtype=self.config.vision_config.torch_dtype ) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, 
constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64) @@ -342,7 +334,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 2cf5dbb2e9..7f90262bec 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -1185,16 +1185,8 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): return past_key_values def get_dummy_inputs( - self, - comp_ctx_lengths: Optional[List[int]] = None, - kv_offload: bool = False, - continuous_batching: bool = False, - **kwargs, + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 336) else: @@ -1202,7 +1194,7 @@ def get_dummy_inputs( # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) max_num_tiles = 17 downsample_ratio = int(round(1.0 / (self.config.vision_config.pixel_shuffle_ratio**2))) num_features_per_tile = int( @@ -1218,7 +1210,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( 
constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( max_num_tiles, # constants.INTERN_NUM_PATCHES, @@ -1234,8 +1226,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -1247,7 +1239,7 @@ def get_dummy_inputs( past_key_values = self.get_dummy_pkv_cache( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 88bb5e1027..3fdfd11b9e 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -168,10 +168,6 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = SEQ_LEN - prefill_seq_len = int(prefill_seq_len) num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -186,11 +182,11 @@ def get_dummy_inputs( "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), 
dtype=self.config.torch_dtype), } lang_inputs = { - "input_ids": torch.ones((BS, prefill_seq_len), dtype=torch.int64), + "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64), "vision_embeds": torch.ones( (BS, vision_size, self.model.language_model.config.hidden_size), dtype=self.config.torch_dtype ), - "attention_mask": torch.ones((BS, prefill_seq_len), dtype=torch.int64), + "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), "image_idx": torch.zeros((1, 1), dtype=torch.int64), } lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1) diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 342269ce50..c2a9137006 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -195,10 +195,6 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.GRANITEVISION_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -225,9 +221,11 @@ def get_dummy_inputs( ), } lang_inputs = { - "input_ids": torch.ones((constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64), + "input_ids": torch.ones( + (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64 + ), "attention_mask": torch.ones( - (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64 + (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64 ), "vision_embeds": torch.ones( ( diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py 
b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 628d1dee2c..9c37353328 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -346,12 +346,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) height = self.config.vision_config.image_size width = self.config.vision_config.image_size patch_size = self.config.vision_config.patch_size @@ -367,7 +363,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -384,8 +380,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -397,7 +393,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) 
lang_inputs["past_key_values"] = [[] for _ in range(self.model.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 45649662a7..d9310c02e4 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -924,12 +924,9 @@ def forward( logits = self.lm_head(hidden_states).float() return logits, image_idx, outputs.past_key_values, pixel_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - seq_len = kwargs.get("prefill_seq_len") - if seq_len is None: - seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - SEQ_LEN = int(seq_len) + SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN CTX_LEN = constants.ONNX_EXPORT_CTX_LEN txt_cfg = self.config.get_text_config() diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index d59ca4e017..3eefba47f5 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -931,13 +931,9 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} inputs_shapes_lang = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -946,7 +942,7 @@ def get_dummy_inputs( ) 
inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -980,8 +976,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -993,7 +989,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.n_layers)] diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 357c4af16e..dd70a31c95 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -831,12 +831,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) 
vision_size = 3577 inputs_shapes["vision_embeds"] = ( @@ -848,7 +844,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = (14308, 1176) inputs_shapes["image_idx"] = (1, 1) @@ -862,8 +858,8 @@ def get_dummy_inputs( lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -878,7 +874,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index dc741969a4..317c5ee261 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -867,12 +867,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) # vision_size = 1024 vision_size = 187 
inputs_shapes["vision_embeds"] = ( @@ -884,7 +880,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = (748, 1536) inputs_shapes["image_idx"] = (1, 1) @@ -908,8 +904,8 @@ def get_dummy_inputs( ) lang_inputs["position_ids"] = ( ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -927,7 +923,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 1bdcd07ada..89c52c9517 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -30,7 +30,7 @@ from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils._utils import IOInfo -from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE, ONNX_EXPORT_EXAMPLE_SEQ_LEN +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding): @@ -792,10 +792,9 @@ def forward( def get_dummy_inputs( self, - **kwargs, ): bs = 1 - seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN)) + seq_len = 32 encoder_seq_len = self.config.max_source_positions encoder_feature_count = 
self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index 85df559970..8181faf430 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -724,36 +724,6 @@ } } ], - "image_text_reranker_models": [ - { - "model_name": "Qwen/Qwen3-VL-Reranker-2B", - "model_type": "qwen3_vl", - "batch_size": 1, - "prompt_len": 128, - "ctx_len": 1024, - "img_size": 1540, - "img_url": "https://picsum.photos/id/237/536/354", - "instruction": "Retrieve candidates relevant to the query.", - "query_text": "A woman playing with her dog on a beach at sunset.", - "document_text": "A woman and her dog spend time together on a beach during sunset.", - "num_layers": 1, - "additional_params": {} - }, - { - "model_name": "Qwen/Qwen3-VL-Reranker-8B", - "model_type": "qwen3_vl", - "batch_size": 1, - "prompt_len": 128, - "ctx_len": 1024, - "img_size": 1540, - "img_url": "https://picsum.photos/id/237/536/354", - "instruction": "Retrieve candidates relevant to the query.", - "query_text": "A woman playing with her dog on a beach at sunset.", - "document_text": "A woman and her dog spend time together on a beach during sunset.", - "num_layers": 1, - "additional_params": {} - } - ], "image_text_embedding_models": [ { "model_name": "Qwen/Qwen3-VL-Embedding-8B", diff --git a/tests/configs/reranker_model_configs.json b/tests/configs/reranker_model_configs.json new file mode 100644 index 0000000000..4427b9da0c --- /dev/null +++ b/tests/configs/reranker_model_configs.json @@ -0,0 +1,28 @@ +[ + { + "model_name": "Qwen/Qwen3-VL-Reranker-2B", + "model_type": "qwen3_vl", + "batch_size": 1, + "prompt_len": 128, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "instruction": "Retrieve candidates relevant to the query.", + "query_text": "A woman playing with her dog on a beach at sunset.", + "document_text": 
"A woman and her dog spend time together on a beach during sunset.", + "num_layers": 1, + "additional_params": {} + }, + { + "model_name": "Qwen/Qwen3-VL-Reranker-8B", + "model_type": "qwen3_vl", + "batch_size": 1, + "prompt_len": 128, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "instruction": "Retrieve candidates relevant to the query.", + "query_text": "A woman playing with her dog on a beach at sunset.", + "document_text": "A woman and her dog spend time together on a beach during sunset.", + "num_layers": 1, + "additional_params": {} + } +] diff --git a/tests/transformers/models/reranker/test_reranker_mad.py b/tests/transformers/models/reranker/test_reranker_mad.py index 148935c5a7..4677f96933 100644 --- a/tests/transformers/models/reranker/test_reranker_mad.py +++ b/tests/transformers/models/reranker/test_reranker_mad.py @@ -39,7 +39,7 @@ ) from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm -CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/image_text_model_configs.json") +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json") PT_AI100_MAD_MAX = 5e-3 MAX_LENGTH = 8192 @@ -60,8 +60,7 @@ } with open(CONFIG_PATH, "r") as f: - config_data = json.load(f) - reranker_models = config_data["image_text_reranker_models"] + reranker_models = json.load(f) test_reranker_models = [model_config["model_name"] for model_config in reranker_models] reranker_model_config_dict = {model["model_name"]: model for model in reranker_models} @@ -298,7 +297,7 @@ def test_qwen3_vl_reranker_mad_parity(model_name): height=compile_height, width=compile_width, prefill_seq_len=max_prompt_len, - ctx_len=model_cfg["ctx_len"], + ctx_len=max_prompt_len, num_devices=1, num_cores=16, mxfp6_matmul=False, diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py index f3036502e1..b79a3d29c9 100644 --- 
a/tests/unit_test/models/reranker/test_reranker_models_unit.py +++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py @@ -8,7 +8,7 @@ Generic unit coverage for image-text reranker model entries. This test is intentionally model-list driven: - - Add/remove reranker models only in tests/configs/image_text_model_configs.json + - Add/remove reranker models only in tests/configs/reranker_model_configs.json - The same unit checks run for every configured reranker model """ @@ -22,13 +22,12 @@ from QEfficient.utils.test_utils import set_num_layers_vlm -CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/image_text_model_configs.json") +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json") def _load_reranker_model_configs() -> List[Dict]: with open(CONFIG_PATH, "r", encoding="utf-8") as file: - config_data = json.load(file) - return config_data.get("image_text_reranker_models", []) + return json.load(file) RERANKER_MODEL_CONFIGS = _load_reranker_model_configs() @@ -51,7 +50,7 @@ def _vision_num_layers(config) -> int: def test_reranker_model_list_is_present(): assert RERANKER_MODEL_CONFIGS, ( - "image_text_reranker_models is empty. Add reranker entries in tests/configs/image_text_model_configs.json." + "reranker_model_configs.json is empty. Add reranker entries in tests/configs/reranker_model_configs.json." ) From fc44abeb8fa23b5e19e46c24f899b9d0d550f954 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 21:24:06 +0530 Subject: [PATCH 09/11] Embedding: single prefill specialization, remove ctx_len from API Mirror of the reranker fix: Qwen3-VL embedding is single-shot prefill (reads last-token hidden state as embedding vector, no decode loop). `get_compile_specs` now returns ctx_len == prefill_seq_len, triggering Solution A in modeling_auto.py to compile only the Prefill kernel. 
Signed-off-by: Amit Raj --- .../transformers/models/qwen3_vl/_embedding_utils.py | 7 +++---- examples/embeddings/qwen3vl/README.md | 1 - examples/embeddings/qwen3vl/qwen3_vl_embedding.py | 3 --- tests/configs/image_text_model_configs.json | 1 - .../models/embedding_models/test_qwen3vl_embedding_mad.py | 1 - .../models/embedding/test_qwen3vl_embedding_unit.py | 4 ++-- 6 files changed, 5 insertions(+), 12 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py index ca0316371d..bce751db9d 100644 --- a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py +++ b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py @@ -257,9 +257,7 @@ def _collect_contexts(self, inputs: List[Dict[str, Any]]): return contexts, max_prompt_len, max_grid_h, max_grid_w - def get_compile_specs( - self, inputs: List[Dict[str, Any]], ctx_len: int, prefill_seq_len: int = None - ) -> Dict[str, int]: + def get_compile_specs(self, inputs: List[Dict[str, Any]], prefill_seq_len: int = None) -> Dict[str, int]: """Compute compile-time spec values for the current input batch.""" _, max_prompt_len, max_grid_h, max_grid_w = self._collect_contexts(inputs) if max_prompt_len == 0: @@ -275,9 +273,10 @@ def get_compile_specs( height = max_grid_h * patch_size width = max_grid_w * patch_size + # ctx_len == prefill_seq_len always: embedding is single-shot prefill, no decode steps. 
return { "prefill_seq_len": target_prefill_seq_len, - "ctx_len": int(ctx_len), + "ctx_len": target_prefill_seq_len, "img_size": max(height, width), "height": height, "width": width, diff --git a/examples/embeddings/qwen3vl/README.md b/examples/embeddings/qwen3vl/README.md index cff14908cc..6f89fade06 100644 --- a/examples/embeddings/qwen3vl/README.md +++ b/examples/embeddings/qwen3vl/README.md @@ -40,7 +40,6 @@ With compile parameters: ```bash python examples/embeddings/qwen3vl/qwen3_vl_embedding.py \ --model-name Qwen/Qwen3-VL-Embedding-8B \ - --ctx-len 2048 \ --num-cores 16 \ --num-devices 1 \ --compile-prefill-seq-len 4096 \ diff --git a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py index bd707ffb08..b3124352a6 100644 --- a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py +++ b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py @@ -24,7 +24,6 @@ from QEfficient.transformers.models.qwen3_vl._embedding_utils import configure_embedding_model_config DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Embedding-8B" -DEFAULT_CTX_LEN = 2048 DEFAULT_NUM_CORES = 16 DEFAULT_NUM_DEVICES = 1 DEFAULT_NUM_HIDDEN_LAYERS = 36 @@ -36,7 +35,6 @@ def parse_args() -> argparse.Namespace: """Parse command-line arguments for AI100 compile/inference knobs.""" parser = argparse.ArgumentParser(description="Qwen3-VL embedding example.") parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME) - parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.") parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.") parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.") parser.add_argument( @@ -121,7 +119,6 @@ def main() -> None: # 3) Derive compile requirements from current payload. 
compile_specs = embedder.get_compile_specs( inputs=model_inputs, - ctx_len=args.ctx_len, prefill_seq_len=args.compile_prefill_seq_len, ) diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index 8181faf430..d98d0e08a2 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -729,7 +729,6 @@ "model_name": "Qwen/Qwen3-VL-Embedding-8B", "model_type": "qwen3_vl", "batch_size": 1, - "ctx_len": 2048, "num_layers": 1, "vision_depth": 9, "deepstack_index": 8, diff --git a/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py b/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py index d540593b86..885372355d 100644 --- a/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py +++ b/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py @@ -108,7 +108,6 @@ def test_qwen3_vl_embedding_cpu_vs_ai100_mad_parity(model_name): model_inputs = EXAMPLE_QUERIES + EXAMPLE_DOCUMENTS compile_specs = embedder.get_compile_specs( inputs=model_inputs, - ctx_len=model_cfg["ctx_len"], prefill_seq_len=model_cfg.get("compile_prefill_seq_len", None), ) qpc_paths = qeff_model.compile( diff --git a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py index ae7c88e837..a602a0f7dd 100644 --- a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py +++ b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py @@ -118,8 +118,8 @@ def _fake_run_ai100_prefill(prepared_inputs, vision_outputs, lang_qpc_path): monkeypatch.setattr(QEffQwen3VLEmbedder, "_run_ai100_vision", staticmethod(_fake_run_ai100_vision)) monkeypatch.setattr(QEffQwen3VLEmbedder, "_run_ai100_prefill", staticmethod(_fake_run_ai100_prefill)) - compile_specs = embedder.get_compile_specs(inputs=[{}, {}], ctx_len=64, prefill_seq_len=12) - assert compile_specs == 
{"prefill_seq_len": 12, "ctx_len": 64, "img_size": 160, "height": 96, "width": 160} + compile_specs = embedder.get_compile_specs(inputs=[{}, {}], prefill_seq_len=12) + assert compile_specs == {"prefill_seq_len": 12, "ctx_len": 12, "img_size": 160, "height": 96, "width": 160} embeddings = embedder.process( inputs=[{}, {}], From 5fbc0d8c8968fd5f1d7bf16107fb76caa335ffcc Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 22:44:01 +0530 Subject: [PATCH 10/11] Address review comments: use ONNX_EXPORT_EXAMPLE_SEQ_LEN constant and simplify config path Signed-off-by: Amit Raj --- QEfficient/transformers/models/whisper/modeling_whisper.py | 4 ++-- tests/unit_test/models/reranker/test_reranker_models_unit.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 89c52c9517..4c30166289 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -30,7 +30,7 @@ from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils._utils import IOInfo -from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE, ONNX_EXPORT_EXAMPLE_SEQ_LEN class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding): @@ -794,7 +794,7 @@ def get_dummy_inputs( self, ): bs = 1 - seq_len = 32 + seq_len = ONNX_EXPORT_EXAMPLE_SEQ_LEN encoder_seq_len = self.config.max_source_positions encoder_feature_count = self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py index b79a3d29c9..7d1321a98b 100644 --- 
a/tests/unit_test/models/reranker/test_reranker_models_unit.py +++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py @@ -14,7 +14,6 @@ import copy import json -import os from typing import Dict, List import pytest @@ -22,7 +21,7 @@ from QEfficient.utils.test_utils import set_num_layers_vlm -CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json") +CONFIG_PATH = "tests/configs/reranker_model_configs.json" def _load_reranker_model_configs() -> List[Dict]: From e14f780835a4244040c85107086fd39b5c0bde60 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 5 Jun 2026 12:22:47 +0530 Subject: [PATCH 11/11] Added support of embedding and reranker model export and compile without kv input output Signed-off-by: Amit Raj --- QEfficient/blocking/attention_blocking.py | 1 + .../transformers/models/modeling_auto.py | 3 + .../models/qwen3_vl/modeling_qwen3_vl.py | 70 ++++++++++++++++--- .../reranker/qwen3vl/qwen3_vl_reranker.py | 1 + 4 files changed, 64 insertions(+), 11 deletions(-) diff --git a/QEfficient/blocking/attention_blocking.py b/QEfficient/blocking/attention_blocking.py index b753420132..cae8840811 100644 --- a/QEfficient/blocking/attention_blocking.py +++ b/QEfficient/blocking/attention_blocking.py @@ -81,6 +81,7 @@ def past_key_value_update( position_ids: Optional[torch.LongTensor] = None, sliding_window: Optional[int] = None, ): + cache_kwargs = {} if past_key_value is not None: cache_kwargs = {"batch_index": batch_index, "position_ids": position_ids} if sliding_window is not None: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 360aaa13ec..4f5ad61d3d 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1268,6 +1268,9 @@ def __init__( ) self.model = model self.config = model.config + # Propagate qaic_config to the full model so helpers like _is_single_shot_mode + # can detect the
mode when get_output_names/get_dummy_inputs are called on it. + model.qaic_config = qaic_config self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, qaic_config=qaic_config, **kwargs) diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 9f609ea2ea..cd39a98f0c 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -57,6 +57,18 @@ def _should_export_embedding_output(module) -> bool: return False +def _is_single_shot_mode(module) -> bool: + """True when model is single-shot prefill only (reranker/embedding) — no KV cache needed.""" + for holder in (module, getattr(module, "model", None)): + if holder is None: + continue + qaic_config = getattr(holder, "qaic_config", None) + if isinstance(qaic_config, dict): + if qaic_config.get("no_kv_cache", False) or qaic_config.get("export_embedding", False): + return True + return False + + def qeff_apply_interleaved_mrope(freqs, mrope_section): """Apply interleaved MRoPE to 3D rotary embeddings. 
Reorganizes frequency layout from chunked [TTT...HHH...WWW] to @@ -549,7 +561,9 @@ def forward( ) -> Union[Tuple, BaseModelOutputWithPast]: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if self.config.use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = False + effective_use_cache = use_cache if use_cache is not None else self.config.use_cache + if effective_use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) @@ -567,7 +581,11 @@ def forward( elif position_ids.dim() == 2: position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) - target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else (past_seen_tokens if past_seen_tokens > 0 else inputs_embeds.shape[1]) + ) causal_mask = _create_causal_mask( position_ids=position_ids[0], target_length=target_length, sliding_window=None ) @@ -696,7 +714,7 @@ def forward( deepstack_features, position_ids, image_idx, - past_key_values, + past_key_values=None, batch_index: Optional[torch.LongTensor] = None, comp_ctx_lengths: Optional[List[int]] = None, ): @@ -705,7 +723,7 @@ def forward( selected = input_ids == self.model.config.image_token_id indices1 = selected.to(torch.int64).cumsum(1) - 1 indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1) - indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1) + indices0 = torch.arange(selected.shape[0], device=selected.device).view(-1, 1) image_features_expanded = vision_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1] num_features, bs, split_size, C = deepstack_features.shape @@ -723,13 +741,14 @@ def forward( visual_pos_masks = image_mask deepstack_visual_embeds = 
deepstack_features_expanded + single_shot = _is_single_shot_mode(self) outputs = self.language_model( inputs_embeds=inputs_embeds, position_ids=position_ids, - past_key_values=past_key_values, + past_key_values=None if single_shot else past_key_values, comp_ctx_lengths=comp_ctx_lengths, batch_index=batch_index, - use_cache=True, + use_cache=not single_shot, visual_pos_masks=visual_pos_masks, deepstack_visual_embeds=deepstack_visual_embeds, ) @@ -737,6 +756,10 @@ def forward( hidden_states = outputs.last_hidden_state[torch.arange(position_ids[0].shape[0]).view(-1, 1), logit_index] logits = self.model.lm_head(hidden_states) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) + if single_shot: + if _should_export_embedding_output(self): + return logits, vision_embeds, deepstack_features, image_idx, hidden_states + return logits, vision_embeds, deepstack_features, image_idx if _should_export_embedding_output(self): return logits, vision_embeds, deepstack_features, image_idx, hidden_states, outputs.past_key_values return logits, vision_embeds, deepstack_features, image_idx, outputs.past_key_values @@ -920,6 +943,8 @@ def get_dummy_inputs( lang_inputs["comp_ctx_lengths"] = torch.randint(0, 100, (40,), dtype=torch.int64) inputs = {} if kv_offload: + if _is_single_shot_mode(self): + lang_inputs.pop("past_key_values") inputs["vision"] = vision_inputs inputs["lang"] = lang_inputs else: @@ -1101,6 +1126,11 @@ def smart_resize( lang = [lang_prefill, lang_decode] + # Single-shot (reranker/embedding): no KV cache → ctx_len not referenced in ONNX + if _is_single_shot_mode(self): + for spec in lang: + spec.pop("ctx_len", None) + specializations = {} if kv_offload: @@ -1149,6 +1179,10 @@ def get_onnx_dynamic_axes( dynamic_axes = {} if kv_offload: + if _is_single_shot_mode(self): + for i in range(num_layers): + lang_dynamic_axes.pop(f"past_key.{i}", None) + lang_dynamic_axes.pop(f"past_value.{i}", None) dynamic_axes["vision"] = vision_dynamic_axes dynamic_axes["lang"] = 
lang_dynamic_axes else: @@ -1166,11 +1200,25 @@ def get_output_names(self, kv_offload: bool = False): output_names = {} if kv_offload: - lang_output_names.insert(1, "vision_embeds_RetainedState") - lang_output_names.insert(2, "image_idx_output") - lang_output_names.insert(2, "deepstack_features_RetainedState") - if _should_export_embedding_output(self): - lang_output_names.insert(4, "embedding_output") + if _is_single_shot_mode(self): + # Single-shot: keep vision/deepstack retained states, drop KV retained states. + # Order matches QEffQwen3VLDecoderWrapper.forward single-shot return: + # reranker: (logits, vision_embeds, deepstack_features, image_idx) + # embedding: (logits, vision_embeds, deepstack_features, image_idx, hidden_states) + lang_output_names = [ + "logits", + "vision_embeds_RetainedState", + "deepstack_features_RetainedState", + "image_idx_output", + ] + if _should_export_embedding_output(self): + lang_output_names.append("embedding_output") # hidden_states is output[4] + else: + lang_output_names.insert(1, "vision_embeds_RetainedState") + lang_output_names.insert(2, "image_idx_output") + lang_output_names.insert(2, "deepstack_features_RetainedState") + if _should_export_embedding_output(self): + lang_output_names.insert(4, "embedding_output") output_names["vision"] = vision_output_names output_names["lang"] = lang_output_names else: diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 504280e7d9..a3d05c3d21 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -96,6 +96,7 @@ def main() -> None: kv_offload=True, trust_remote_code=True, config=config, + qaic_config={"no_kv_cache": True}, ) # 2) Build reranker helper and reference payload.