From f074a75a19d487b95af58bc561536b38244e4c7e Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Sat, 18 Apr 2026 10:55:00 +0000 Subject: [PATCH 01/15] Enabling support of rerankers models 2B and 8B of qwen3vl bucket Signed-off-by: Amit Raj --- .../transformers/models/modeling_auto.py | 8 +- .../models/qwen3vl/reranker/README.md | 52 ++ .../qwen3vl/reranker/qwen3_vl_reranker.py | 555 ++++++++++++++++++ tests/configs/image_text_model_configs.json | 2 +- .../image_text_to_text/test_reranker_mad.py | 455 ++++++++++++++ 5 files changed, 1069 insertions(+), 3 deletions(-) create mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/README.md create mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py create mode 100644 tests/transformers/models/image_text_to_text/test_reranker_mad.py diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 65b89d274f..af7cce655b 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1389,7 +1389,7 @@ def export( kv_offload=True, continuous_batching=self.continuous_batching, comp_ctx_lengths=self.comp_ctx_lengths_decode, - **dummy_inputs_kwargs, + prefill_seq_len=prefill_seq_len, ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, @@ -1397,7 +1397,11 @@ def export( comp_ctx_lengths=self.comp_ctx_lengths_decode, ) except TypeError: - inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode) + inputs = self.model.get_dummy_inputs( + kv_offload=True, + comp_ctx_lengths=self.comp_ctx_lengths_decode, + prefill_seq_len=prefill_seq_len, + ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode ) diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/README.md b/examples/image_text_to_text/models/qwen3vl/reranker/README.md new file mode 100644 index 
0000000000..a3e715478d --- /dev/null +++ b/examples/image_text_to_text/models/qwen3vl/reranker/README.md @@ -0,0 +1,52 @@ +# Qwen3-VL Reranker Inference + +This directory contains an AI100 example for running Qwen3-VL reranker models with QEfficient and printing per-document relevance scores. + +Supported models: +- `Qwen/Qwen3-VL-Reranker-2B` +- `Qwen/Qwen3-VL-Reranker-8B` + +## What this example does + +- Loads Qwen3-VL reranker from Hugging Face (or local snapshot path). +- Uses QEff dual-QPC execution (vision encoder + language model). +- Runs the same query against multiple text/image documents. +- Prints one score per document in input order. + +## Required package + +- `qwen-vl-utils>=0.0.14` + +```bash +pip install "qwen-vl-utils>=0.0.14" +``` + +## Script + +- `qwen3_vl_reranker.py` + +## Run + +```bash +python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ + --model-name Qwen/Qwen3-VL-Reranker-2B +``` + +Or run with 8B: + +```bash +python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ + --model-name Qwen/Qwen3-VL-Reranker-8B +``` + +With compile parameters: + +```bash +python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ + --model-name Qwen/Qwen3-VL-Reranker-2B \ + --ctx-len 2048 \ + --num-cores 16 \ + --num-devices 1 \ + --compile-prefill-seq-len 4096 \ + --mxfp6-matmul +``` diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py new file mode 100644 index 0000000000..2fdd225571 --- /dev/null +++ b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py @@ -0,0 +1,555 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse +import os +from typing import Dict, List, Tuple + +import numpy as np +import torch +from huggingface_hub import snapshot_download +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor + +from QEfficient import QEFFAutoModelForImageTextToText +from QEfficient.generation.cloud_infer import QAICInferenceSession + +DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B" +DEFAULT_CTX_LEN = 2048 +DEFAULT_NUM_CORES = 16 +DEFAULT_NUM_DEVICES = 1 + +# Max token budget used by this example's manual truncation/padding flow. +MAX_LENGTH = 8192 +# Pixel constraints used by Qwen3-VL preprocessing. +IMAGE_BASE_FACTOR = 16 +IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2 +MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR +MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR +FPS = 1.0 + + +class QEffQwen3VLReranker: + @staticmethod + def _resolve_model_source(model_name_or_path: str) -> str: + """Return a local model directory, downloading a snapshot for an HF repo id. + + Why: + Some transformers versions can fail when resolving chat templates + directly from a repo id for this model. Loading from a local snapshot directory avoids that code path. + """ + if os.path.isdir(model_name_or_path): + return model_name_or_path + return snapshot_download(repo_id=model_name_or_path) + + def __init__( + self, + model_name_or_path: str = DEFAULT_MODEL_NAME, + ctx_len: int = DEFAULT_CTX_LEN, + num_cores: int = DEFAULT_NUM_CORES, + num_devices: int = DEFAULT_NUM_DEVICES, + mxfp6_matmul: bool = False, + compile_prefill_seq_len: int = None, + ): + """Initialize the AI100-only reranker wrapper. + + This loads: + - HF config/processor for prompt and multimodal preprocessing. + - QEFF dual-QPC model wrapper (vision encoder + language decoder). + - Token ids for "yes"/"no" used to compute reranker scores. 
+ + Parameters + ---------- + model_name_or_path: + HF model id or local snapshot path. + """ + self.model_name_or_path = model_name_or_path + self.model_source = self._resolve_model_source(model_name_or_path) + self.ctx_len = ctx_len + self.num_cores = num_cores + self.num_devices = num_devices + self.mxfp6_matmul = mxfp6_matmul + self.compile_prefill_seq_len = compile_prefill_seq_len + self.max_length = MAX_LENGTH + self.fps = FPS + + # Use local snapshot for stable processor/chat-template loading. + config = AutoConfig.from_pretrained(self.model_source, trust_remote_code=True, padding=True) + if hasattr(config, "use_cache"): + config.use_cache = True + if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): + config.text_config.use_cache = True + + self.processor = AutoProcessor.from_pretrained(self.model_source, trust_remote_code=True, padding=True) + self.model = QEFFAutoModelForImageTextToText.from_pretrained( + self.model_source, + kv_offload=True, + trust_remote_code=True, + config=config, + ) + + self.yes_token_id, self.no_token_id = self._get_yes_no_token_ids(self.processor.tokenizer) + self._compiled_qpc_paths = None + self._compiled_prefill_seq_len = 0 + self._compiled_height = None + self._compiled_width = None + + @staticmethod + def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]: + """Resolve tokenizer ids for the exact tokens 'yes' and 'no'.""" + vocab = tokenizer.get_vocab() + if "yes" not in vocab or "no" not in vocab: + raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.") + return vocab["yes"], vocab["no"] + + @staticmethod + def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: + """Convert model logits into a reranker relevance score. + + Score formula: + sigmoid(logit_yes - logit_no) + """ + # Convert runtime output to torch and use final-token logits. 
+ logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() + if logits_tensor.ndim == 3: + logits_tensor = logits_tensor[:, -1, :] + # Binary relevance score from yes/no logit gap. + score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) + return float(score[0].item()) + + @staticmethod + def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: + """Truncate while preserving all special tokens in sequence order.""" + if len(tokens) <= max_length: + return tokens + + # Preserve all special/control tokens and trim only non-special tokens. + special_tokens_set = set(special_tokens) + num_special = sum(1 for token in tokens if token in special_tokens_set) + num_non_special_to_keep = max_length - num_special + + final_tokens = [] + non_special_kept_count = 0 + for token in tokens: + if token in special_tokens_set: + final_tokens.append(token) + elif non_special_kept_count < num_non_special_to_keep: + final_tokens.append(token) + non_special_kept_count += 1 + return final_tokens + + def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: + """Build one multimodal content block (prefix + optional image + optional text).""" + # Prefix helps the model distinguish query vs document sections. + content = [{"type": "text", "text": prefix}] + + if not text and not image and not video: + content.append({"type": "text", "text": "NULL"}) + return content + + if video: + raise ValueError("Video input is not supported in this AI100-only example.") + + if image: + # Convert local paths to file:// URIs for the processor. 
+ if isinstance(image, str): + image_content = image if image.startswith(("http", "oss")) else "file://" + image + else: + image_content = image + content.append( + { + "type": "image", + "image": image_content, + "min_pixels": MIN_PIXELS, + "max_pixels": MAX_PIXELS, + } + ) + + if text: + content.append({"type": "text", "text": text}) + + return content + + def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: + """Create the chat payload for one query-document pair.""" + # Prompt shape follows the HF reranker reference format. + contents = [{"type": "text", "text": ": " + instruction}] + + contents.extend( + self._format_mm_content( + query.get("text"), + query.get("image"), + query.get("video"), + prefix=":", + ) + ) + contents.extend( + self._format_mm_content( + document.get("text"), + document.get("image"), + document.get("video"), + prefix="\n:", + ) + ) + + return [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": ( + "Judge whether the Document meets the requirements based on the Query and the Instruct " + 'provided. Note that the answer can only be "yes" or "no".' + ), + } + ], + }, + {"role": "user", "content": contents}, + ] + + def _tokenize_pair(self, pair: List[Dict]) -> Dict: + """Tokenize a query-document pair with the exact HF multimodal pipeline.""" + # Processor expects list-of-conversations. + pairs = [pair] + text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) + + # Build image/video tensors + metadata for processor inputs. 
+ images, videos, video_kwargs = process_vision_info( + pairs, + image_patch_size=16, + return_video_kwargs=True, + return_video_metadata=True, + ) + + if videos is not None: + videos, video_metadatas = zip(*videos) + videos = list(videos) + video_metadatas = list(video_metadatas) + else: + video_metadatas = None + + inputs = self.processor( + text=text, + images=images, + videos=videos, + video_metadata=video_metadatas, + truncation=False, + padding=False, + do_resize=False, + **video_kwargs, + ) + + # Apply custom truncation preserving trailing template control tokens. + for i, input_ids in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = ( + self._truncate_tokens_optimized( + input_ids[:-5], + self.max_length, + self.processor.tokenizer.all_special_ids, + ) + + input_ids[-5:] + ) + + # Re-pad through tokenizer utilities so masks align with token ids. + padded = self.processor.tokenizer.pad( + {"input_ids": inputs["input_ids"]}, + padding=True, + return_tensors="pt", + max_length=self.max_length, + ) + for key in padded: + inputs[key] = padded[key] + + if "pixel_values" in inputs: + # Keep pixels fp32 before explicit cast to fp16 during vision run. + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + return inputs + + def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int = None): + """Prepare model inputs for dual-QPC prefill execution.""" + # True prompt length before compile-aligned padding. + runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1]) + effective_prefill = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len + if effective_prefill < runtime_prompt_len: + raise ValueError( + f"prefill_seq_len ({effective_prefill}) must be >= runtime prompt length ({runtime_prompt_len})." + ) + + # Let model helper compute position_ids and multimodal placement. 
+ prepared_inputs = self.model.model.prepare_inputs_for_generation( + inputs=tokenized_inputs, + prefill_seq_len=effective_prefill, + batch_size=1, + ) + + # Normalize image_grid_thw to the shape consumed by compiled path. + if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2: + thw = prepared_inputs["image_grid_thw"][0] + t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item()) + prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype) + + if "pixel_values" in prepared_inputs: + prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32) + + return prepared_inputs, runtime_prompt_len + + def _ensure_compiled(self, prefill_seq_len: int, height: int, width: int): + """Compile QPCs if needed, otherwise reuse cached compiled artifacts.""" + # Reuse previously compiled artifacts whenever shapes are compatible. + if ( + self._compiled_qpc_paths is not None + and prefill_seq_len <= self._compiled_prefill_seq_len + and height == self._compiled_height + and width == self._compiled_width + ): + return + + reuse_vision_qpc = ( + self._compiled_qpc_paths is not None and height == self._compiled_height and width == self._compiled_width + ) + + # Compile one max prefill specialization and optionally skip vision recompile. 
+ compiled_paths = self.model.compile( + prefill_seq_len=prefill_seq_len, + ctx_len=self.ctx_len, + img_size=max(height, width), + height=height, + width=width, + num_cores=self.num_cores, + num_devices=self.num_devices, + mxfp6_matmul=self.mxfp6_matmul, + # vision_embed_fp32=True, + skip_vision=reuse_vision_qpc, + ) + if reuse_vision_qpc: + compiled_paths["vision_qpc_path"] = self._compiled_qpc_paths["vision_qpc_path"] + + self._compiled_qpc_paths = compiled_paths + self._compiled_prefill_seq_len = prefill_seq_len + self._compiled_height = height + self._compiled_width = width + + @staticmethod + def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + """Create zero-valued placeholders matching vision output buffers.""" + return {name: np.zeros_like(value) for name, value in vision_outputs.items()} + + def _run_ai100_vision(self, prepared_inputs) -> Dict[str, np.ndarray]: + """Run the compiled vision encoder QPC and return retained-state buffers.""" + if "pixel_values" not in prepared_inputs or "image_grid_thw" not in prepared_inputs: + raise ValueError("Missing pixel_values/image_grid_thw for vision execution.") + + # Vision session produces retained states consumed by language session. + vision_session = QAICInferenceSession(self._compiled_qpc_paths["vision_qpc_path"]) + vision_outputs = vision_session.run( + { + # Vision qpc expects fp16 pixels + int64 grid coordinates. + "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16), + "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64), + } + ) + vision_session.deactivate() + return vision_outputs + + def _run_ai100_prefill(self, prepared_inputs, vision_template: Dict[str, np.ndarray]) -> np.ndarray: + """Run one prefill pass on AI100 language QPC and return logits.""" + # Match runtime input to compiled prefill length. 
+ prefill_len = prepared_inputs["position_ids"].shape[-1] + input_ids = prepared_inputs["input_ids"] + if input_ids.shape[1] < prefill_len: + pad = torch.full( + (input_ids.shape[0], prefill_len - input_ids.shape[1]), + 1, + dtype=input_ids.dtype, + device=input_ids.device, + ) + input_ids = torch.cat([input_ids, pad], dim=1) + else: + input_ids = input_ids[:, :prefill_len] + + position_ids = prepared_inputs["position_ids"][..., :prefill_len] + + # For text-only docs, inject zeroed retained states with matching shapes. + if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: + vision_outputs = self._run_ai100_vision(prepared_inputs) + else: + vision_outputs = self._zero_vision_outputs(vision_template) + + # Skip past/retained buffers and run only required prefill inputs. + lang_session = QAICInferenceSession(self._compiled_qpc_paths["lang_qpc_path"]) + lang_session.skip_buffers( + [ + name + for name in lang_session.input_names + lang_session.output_names + if name.startswith("past_") or name.endswith("_RetainedState") + ] + ) + lang_session.set_buffers(vision_outputs) + outputs = lang_session.run( + { + # image_idx selects the vision buffer slot for this request. + "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), + "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), + "image_idx": np.zeros((1, 1), dtype=np.int64), + } + ) + lang_session.deactivate() + return outputs["logits"] + + def process(self, inputs: Dict) -> List[float]: + """Score all documents for one query on AI100. + + High-level flow: + 1) Build model-ready query-document pairs. + 2) Find max prompt/image shape across all docs. + 3) Compile once at max shape (single stable specialization). + 4) Run prefill per doc and convert logits -> score. + """ + # Unpack user payload. 
+ instruction = inputs["instruction"] + query = inputs.get("query", {}) + documents = inputs.get("documents", []) + + # Collect per-document tokenized contexts first so we can compile once + # with the largest prompt/image shape required by this request. + prepared_contexts = [] + max_prompt_len = 0 + max_grid_h = 22 + max_grid_w = 34 + + # Build each pair in the exact chat-template format expected by the model. + for document in documents: + pair = self._format_mm_instruction(instruction, query, document) + tokenized = self._tokenize_pair(pair) + runtime_prompt_len = int(tokenized["input_ids"].shape[1]) + + # Track the max image grid (H, W) seen so compile dimensions can + # handle all documents in this batch. + if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: + grid = tokenized["image_grid_thw"] + max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) + max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) + + prepared_contexts.append( + { + "tokenized": tokenized, + "runtime_prompt_len": runtime_prompt_len, + } + ) + max_prompt_len = max(max_prompt_len, runtime_prompt_len) + + # Empty documents list => no scores. + if max_prompt_len == 0: + return [] + + # Convert max grid to compile-time pixel dimensions using model patch size. + patch_size = int(self.model.model.config.vision_config.patch_size) + compile_height = max_grid_h * patch_size + compile_width = max_grid_w * patch_size + + # Compile/reuse a single language specialization and prepare all requests + # to that same prefill length to avoid per-document recompiles. + target_prefill_seq_len = max_prompt_len + if self.compile_prefill_seq_len is not None: + if self.compile_prefill_seq_len < max_prompt_len: + raise ValueError( + f"--compile-prefill-seq-len ({self.compile_prefill_seq_len}) must be >= " + f"max runtime prompt length ({max_prompt_len})." 
+ ) + target_prefill_seq_len = self.compile_prefill_seq_len + + self._ensure_compiled(target_prefill_seq_len, compile_height, compile_width) + + # Prepare all documents to the same prefill length used at compile time. + prepared_contexts_with_prefill = [] + vision_template = None + for ctx in prepared_contexts: + prepared_inputs, _ = self._prepare_inputs(ctx["tokenized"], prefill_seq_len=target_prefill_seq_len) + prepared_contexts_with_prefill.append({"prepared_inputs": prepared_inputs}) + + # Capture one real vision-output template so text-only docs can reuse + # zero-valued buffers with exact matching shapes. + if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: + vision_template = self._run_ai100_vision(prepared_inputs) + + # This example currently expects at least one image document to establish + # retained-state buffer shapes for mixed image/text batches. + if vision_template is None: + raise ValueError("At least one image document is required to initialize AI100 vision buffers.") + + # Run language prefill and compute scalar score per document. + scores = [] + for ctx in prepared_contexts_with_prefill: + logits = self._run_ai100_prefill( + ctx["prepared_inputs"], + vision_template=vision_template, + ) + # Reranker score = sigmoid(logit_yes - logit_no). + score = self._score_from_logits(logits, self.yes_token_id, self.no_token_id) + scores.append(score) + + return scores + + +def main(): + # CLI exposes the model id/path plus the compile-time options defined below. 
+ parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.") + parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME) + parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.") + parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.") + parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.") + parser.add_argument( + "--mxfp6-matmul", + action="store_true", + help="Enable MXFP6 matmul during compile (default: disabled).", + ) + parser.add_argument( + "--compile-prefill-seq-len", + type=int, + default=None, + help=( + "Optional fixed prefill sequence length for compile/padding. " + "Must be >= max prompt length of the current request." + ), + ) + args = parser.parse_args() + + model = QEffQwen3VLReranker( + model_name_or_path=args.model_name, + ctx_len=args.ctx_len, + num_cores=args.num_cores, + num_devices=args.num_devices, + mxfp6_matmul=args.mxfp6_matmul, + compile_prefill_seq_len=args.compile_prefill_seq_len, + ) + + # Example input payload matching the HF reranker schema. + inputs = { + "instruction": "Retrieve images or text relevant to the user's query.", + "query": {"text": "A woman playing with her dog on a beach at sunset."}, + "documents": [ + { + "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust." 
+ }, + {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, + { + "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", + "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + ], + "fps": 1.0, + } + + # Print one score per document in the same order as inputs["documents"]. + scores = model.process(inputs) + print(scores) + + +if __name__ == "__main__": + main() diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index 85df559970..f4cdb6a0fd 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -5,7 +5,7 @@ "model_type": "llava", "batch_size": 1, "prompt_len": 784, - "ctx_len": 1024, + "ctx_len": 2048, "img_size": 336, "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", diff --git a/tests/transformers/models/image_text_to_text/test_reranker_mad.py b/tests/transformers/models/image_text_to_text/test_reranker_mad.py new file mode 100644 index 0000000000..3a6497b520 --- /dev/null +++ b/tests/transformers/models/image_text_to_text/test_reranker_mad.py @@ -0,0 +1,455 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +import json +import os +from typing import Dict, List, Tuple + +import numpy as np +import pytest +import torch +from huggingface_hub import snapshot_download +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor + +from QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText +from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm + +CONFIG_PATH = "tests/configs/image_text_model_configs.json" + +PT_AI100_MAD_MAX = 5e-3 +MAX_LENGTH = 8192 +RERANKER_DOC_LIMIT = int(os.getenv("QEFF_RERANKER_DOC_LIMIT", "0")) + +IMAGE_BASE_FACTOR = 16 +IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2 +MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR +MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR + +EXAMPLE_INPUTS = { + "instruction": "Retrieve relevant content.", + "query": {"text": "dog on beach"}, + "documents": [ + {"image": "https://picsum.photos/id/237/536/354"}, + {"text": "A dog running on the beach."}, + ], +} + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + reranker_models = config_data["image_text_reranker_models"] + +test_reranker_models = [model_config["model_name"] for model_config in reranker_models] +reranker_model_config_dict = {model["model_name"]: model for model in reranker_models} + + +def _resolve_model_source(model_name_or_path: str) -> str: + if os.path.isdir(model_name_or_path): + return model_name_or_path + return snapshot_download(repo_id=model_name_or_path) + + +def _format_mm_content(text, image, video, prefix: str) -> List[Dict]: + content = [{"type": "text", "text": prefix}] + + if not text and not image and not video: + content.append({"type": "text", "text": "NULL"}) + return content + + if video: + raise ValueError("Video input is not supported in this test.") + + 
if image: + if isinstance(image, str): + image_content = image if image.startswith(("http", "oss")) else "file://" + image + else: + image_content = image + content.append( + { + "type": "image", + "image": image_content, + "min_pixels": MIN_PIXELS, + "max_pixels": MAX_PIXELS, + } + ) + + if text: + content.append({"type": "text", "text": text}) + + return content + + +def _format_mm_instruction(instruction: str, query: Dict, document: Dict) -> List[Dict]: + contents = [{"type": "text", "text": ": " + instruction}] + + contents.extend( + _format_mm_content( + query.get("text"), + query.get("image"), + query.get("video"), + prefix=":", + ) + ) + contents.extend( + _format_mm_content( + document.get("text"), + document.get("image"), + document.get("video"), + prefix="\n:", + ) + ) + + return [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": ( + "Judge whether the Document meets the requirements based on the Query and the Instruct " + 'provided. Note that the answer can only be "yes" or "no".' 
+ ), + } + ], + }, + {"role": "user", "content": contents}, + ] + + +def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: + if len(tokens) <= max_length: + return tokens + + special_tokens_set = set(special_tokens) + num_special = sum(1 for token in tokens if token in special_tokens_set) + num_non_special_to_keep = max_length - num_special + + final_tokens = [] + non_special_kept_count = 0 + for token in tokens: + if token in special_tokens_set: + final_tokens.append(token) + elif non_special_kept_count < num_non_special_to_keep: + final_tokens.append(token) + non_special_kept_count += 1 + return final_tokens + + +def _tokenize_pair(processor, pair: List[Dict]) -> Dict: + pairs = [pair] + text = processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) + + images, videos, video_kwargs = process_vision_info( + pairs, + image_patch_size=16, + return_video_kwargs=True, + return_video_metadata=True, + ) + + if videos is not None: + videos, video_metadatas = zip(*videos) + videos = list(videos) + video_metadatas = list(video_metadatas) + else: + video_metadatas = None + + inputs = processor( + text=text, + images=images, + videos=videos, + video_metadata=video_metadatas, + truncation=False, + padding=False, + do_resize=False, + **video_kwargs, + ) + + for i, input_ids in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = ( + _truncate_tokens_optimized( + input_ids[:-5], + MAX_LENGTH, + processor.tokenizer.all_special_ids, + ) + + input_ids[-5:] + ) + + padded = processor.tokenizer.pad( + {"input_ids": inputs["input_ids"]}, + padding=True, + return_tensors="pt", + max_length=MAX_LENGTH, + ) + for key in padded: + inputs[key] = padded[key] + + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + return inputs + + +def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]: + vocab = tokenizer.get_vocab() + if "yes" not in vocab or "no" not in 
vocab: + raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.") + return vocab["yes"], vocab["no"] + + +def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> np.ndarray: + if isinstance(logits, np.ndarray): + logits_tensor = torch.from_numpy(logits) + else: + logits_tensor = logits.detach().cpu() + + if logits_tensor.ndim == 3: + logits_tensor = logits_tensor[:, -1, :] + elif logits_tensor.ndim != 2: + raise ValueError(f"Unsupported logits rank for score conversion: {logits_tensor.ndim}") + + score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) + return score.detach().cpu().numpy().astype(np.float64) + + +def _score_from_last_hidden(last_hidden_state: torch.Tensor, score_linear: torch.nn.Linear) -> np.ndarray: + score = torch.sigmoid(score_linear(last_hidden_state[:, -1])).squeeze(-1) + return score.detach().cpu().numpy().astype(np.float64) + + +def _make_score_linear(model_hf, yes_token_id: int, no_token_id: int) -> torch.nn.Linear: + lm_head_weights = model_hf.lm_head.weight.data + weight_yes = lm_head_weights[yes_token_id] + weight_no = lm_head_weights[no_token_id] + + linear_layer = torch.nn.Linear(weight_yes.shape[0], 1, bias=False) + with torch.no_grad(): + linear_layer.weight[0] = weight_yes - weight_no + return linear_layer.eval() + + +def _mad_stats(reference: np.ndarray, candidate: np.ndarray) -> Tuple[float, float]: + diff = np.abs(reference - candidate) + return float(np.mean(diff)), float(np.max(diff)) + + +def _prepare_qeff_inputs(qeff_model, tokenized_inputs: Dict, prefill_seq_len: int = None): + runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1]) + effective_prefill_seq_len = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len + if effective_prefill_seq_len < runtime_prompt_len: + raise ValueError( + f"prefill_seq_len ({effective_prefill_seq_len}) must be >= runtime prompt length ({runtime_prompt_len})." 
+ ) + + prepared_inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=tokenized_inputs, + prefill_seq_len=effective_prefill_seq_len, + batch_size=1, + ) + + if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2: + thw = prepared_inputs["image_grid_thw"][0] + t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item()) + prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype) + + if "pixel_values" in prepared_inputs: + prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32) + + return prepared_inputs, runtime_prompt_len + + +def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + return {name: np.zeros_like(value) for name, value in vision_outputs.items()} + + +def _run_ai100_vision(vision_qpc_path: str, prepared_inputs) -> Dict[str, np.ndarray]: + vision_session = QAICInferenceSession(vision_qpc_path) + vision_inputs = { + "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16), + "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64), + } + vision_outputs = vision_session.run(vision_inputs) + vision_session.deactivate() + return vision_outputs + + +def _run_ai100_prefill(qpc_paths, prepared_inputs, vision_template): + if not isinstance(qpc_paths, dict): + raise ValueError("Expected qpc_paths to be a dict with vision/lang QPC keys.") + + vision_qpc_path = qpc_paths.get("vision_qpc_path") + lang_qpc_path = qpc_paths.get("lang_qpc_path") + if vision_qpc_path is None or lang_qpc_path is None: + raise ValueError("Missing vision_qpc_path/lang_qpc_path in compiled QPC outputs.") + + prefill_len = prepared_inputs["position_ids"].shape[-1] + input_ids = prepared_inputs["input_ids"] + if input_ids.shape[1] < prefill_len: + pad = torch.full( + (input_ids.shape[0], prefill_len - input_ids.shape[1]), + 1, + dtype=input_ids.dtype, + device=input_ids.device, + ) + 
input_ids = torch.cat([input_ids, pad], dim=1) + else: + input_ids = input_ids[:, :prefill_len] + position_ids = prepared_inputs["position_ids"][..., :prefill_len] + + if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: + vision_outputs = _run_ai100_vision(vision_qpc_path, prepared_inputs) + else: + vision_outputs = _zero_vision_outputs(vision_template) + + lang_session = QAICInferenceSession(lang_qpc_path) + lang_session.skip_buffers( + [ + name + for name in lang_session.input_names + lang_session.output_names + if name.startswith("past_") or name.endswith("_RetainedState") + ] + ) + lang_session.set_buffers(vision_outputs) + lang_inputs = { + "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), + "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), + "image_idx": np.zeros((1, 1), dtype=np.int64), + } + outputs = lang_session.run(lang_inputs) + lang_session.deactivate() + return outputs["logits"] + + +@pytest.mark.on_qaic +@pytest.mark.multimodal +@pytest.mark.regular +@pytest.mark.parametrize("model_name", test_reranker_models) +def test_qwen3_vl_reranker_mad_parity(model_name): + torch.manual_seed(42) + model_cfg = reranker_model_config_dict[model_name] + model_source = _resolve_model_source(model_name) + + config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) + config = set_num_layers_vlm(config, n_layer=model_cfg["num_layers"]) + if hasattr(config, "use_cache"): + config.use_cache = True + if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): + config.text_config.use_cache = True + + model_hf = load_vlm_model(config) + model_hf.eval() + + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_source, + kv_offload=True, + config=config, + ) + processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) + + yes_token_id, no_token_id = _get_yes_no_token_ids(processor.tokenizer) + score_linear = 
_make_score_linear(model_hf, yes_token_id, no_token_id).to(next(model_hf.parameters()).device) + score_linear = score_linear.to(dtype=next(model_hf.parameters()).dtype) + + doc_contexts = [] + max_prompt_len = 0 + max_grid_h = 22 + max_grid_w = 34 + + hf_scores_list = [] + + documents = EXAMPLE_INPUTS["documents"] + if RERANKER_DOC_LIMIT > 0: + documents = documents[:RERANKER_DOC_LIMIT] + + for document in documents: + pair = _format_mm_instruction( + instruction=EXAMPLE_INPUTS["instruction"], + query=EXAMPLE_INPUTS["query"], + document=document, + ) + tokenized = _tokenize_pair(processor, pair) + runtime_prompt_len = int(tokenized["input_ids"].shape[1]) + + hf_inputs = {} + for key, value in tokenized.items(): + hf_inputs[key] = value.to(next(model_hf.parameters()).device) if torch.is_tensor(value) else value + with torch.no_grad(): + hf_last_hidden = model_hf.model(**hf_inputs).last_hidden_state + hf_score = _score_from_last_hidden(hf_last_hidden, score_linear)[0] + hf_scores_list.append(float(hf_score)) + + if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: + grid = tokenized["image_grid_thw"] + max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) + max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) + + doc_contexts.append( + { + "tokenized": tokenized, + } + ) + max_prompt_len = max(max_prompt_len, runtime_prompt_len) + + patch_size = int(qeff_model.model.config.vision_config.patch_size) + compile_height = max_grid_h * patch_size + compile_width = max_grid_w * patch_size + + qpc_paths = qeff_model.compile( + img_size=max(compile_height, compile_width), + height=compile_height, + width=compile_width, + prefill_seq_len=max_prompt_len, + ctx_len=model_cfg["ctx_len"], + num_devices=1, + num_cores=16, + mxfp6_matmul=False, + ) + + ai100_scores_list = [] + + prepared_contexts = [] + vision_template_ai100 = None + for context in doc_contexts: + prepared_inputs, _ = _prepare_qeff_inputs( + qeff_model=qeff_model, + 
tokenized_inputs=context["tokenized"], + prefill_seq_len=max_prompt_len, + ) + prepared_contexts.append( + { + "prepared_inputs": prepared_inputs, + } + ) + if vision_template_ai100 is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: + vision_template_ai100 = _run_ai100_vision(qpc_paths["vision_qpc_path"], prepared_inputs) + + if vision_template_ai100 is None: + raise ValueError("Expected at least one image document to initialize vision templates.") + + for context in prepared_contexts: + prepared_inputs_runtime = context["prepared_inputs"] + ai100_logits = _run_ai100_prefill( + qpc_paths=qpc_paths, + prepared_inputs=prepared_inputs_runtime, + vision_template=vision_template_ai100, + ) + ai100_score = _score_from_logits(ai100_logits, yes_token_id, no_token_id)[0] + ai100_scores_list.append(float(ai100_score)) + + hf_scores = np.array(hf_scores_list, dtype=np.float64) + ai100_scores = np.array(ai100_scores_list, dtype=np.float64) + + print(f"[SCORES] PyTorch(original): {hf_scores.tolist()}") + print(f"[SCORES] AI100: {ai100_scores.tolist()}") + + pt_ai100_mad_mean, pt_ai100_mad_max = _mad_stats(hf_scores, ai100_scores) + print(f"[MAD] PyTorch(original) vs AI100: mean={pt_ai100_mad_mean:.6e}, max={pt_ai100_mad_max:.6e}") + assert pt_ai100_mad_max <= PT_AI100_MAD_MAX, ( + f"PyTorch(original) vs AI100 MAD max {pt_ai100_mad_max:.6e} " + f"exceeds threshold {PT_AI100_MAD_MAX:.6e}. " + f"Check tokenizer ids, prompt formatting, runtime prompt length slicing, and compile dimensions." 
+ ) From 66763898a3f0824830ec787a003e80c44afbb940 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 19 May 2026 08:50:39 +0530 Subject: [PATCH 02/15] Functionality changes to PR and rebase with main branch Signed-off-by: Amit Raj --- .../models/qwen3vl/reranker/README.md | 52 -- .../qwen3vl/reranker/qwen3_vl_reranker.py | 555 ------------------ tests/configs/image_text_model_configs.json | 2 +- .../image_text_to_text/test_reranker_mad.py | 455 -------------- 4 files changed, 1 insertion(+), 1063 deletions(-) delete mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/README.md delete mode 100644 examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py delete mode 100644 tests/transformers/models/image_text_to_text/test_reranker_mad.py diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/README.md b/examples/image_text_to_text/models/qwen3vl/reranker/README.md deleted file mode 100644 index a3e715478d..0000000000 --- a/examples/image_text_to_text/models/qwen3vl/reranker/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Qwen3-VL Reranker Inference - -This directory contains an AI100 example for running Qwen3-VL reranker models with QEfficient and printing per-document relevance scores. - -Supported models: -- `Qwen/Qwen3-VL-Reranker-2B` -- `Qwen/Qwen3-VL-Reranker-8B` - -## What this example does - -- Loads Qwen3-VL reranker from Hugging Face (or local snapshot path). -- Uses QEff dual-QPC execution (vision encoder + language model). -- Runs the same query against multiple text/image documents. -- Prints one score per document in input order. 
- -## Required package - -- `qwen-vl-utils>=0.0.14` - -```bash -pip install "qwen-vl-utils>=0.0.14" -``` - -## Script - -- `qwen3_vl_reranker.py` - -## Run - -```bash -python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ - --model-name Qwen/Qwen3-VL-Reranker-2B -``` - -Or run with 8B: - -```bash -python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ - --model-name Qwen/Qwen3-VL-Reranker-8B -``` - -With compile parameters: - -```bash -python examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py \ - --model-name Qwen/Qwen3-VL-Reranker-2B \ - --ctx-len 2048 \ - --num-cores 16 \ - --num-devices 1 \ - --compile-prefill-seq-len 4096 \ - --mxfp6-matmul -``` diff --git a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py b/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py deleted file mode 100644 index 2fdd225571..0000000000 --- a/examples/image_text_to_text/models/qwen3vl/reranker/qwen3_vl_reranker.py +++ /dev/null @@ -1,555 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import argparse -import os -from typing import Dict, List, Tuple - -import numpy as np -import torch -from huggingface_hub import snapshot_download -from qwen_vl_utils import process_vision_info -from transformers import AutoConfig, AutoProcessor - -from QEfficient import QEFFAutoModelForImageTextToText -from QEfficient.generation.cloud_infer import QAICInferenceSession - -DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B" -DEFAULT_CTX_LEN = 2048 -DEFAULT_NUM_CORES = 16 -DEFAULT_NUM_DEVICES = 1 - -# Max token budget used by this example's manual truncation/padding flow. -MAX_LENGTH = 8192 -# Pixel constraints used by Qwen3-VL preprocessing. 
-IMAGE_BASE_FACTOR = 16 -IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2 -MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR -MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR -FPS = 1.0 - - -class QEffQwen3VLReranker: - @staticmethod - def _resolve_model_source(model_name_or_path: str) -> str: - """Return a local model path when given an HF repo id. - - Why: - Some transformers versions can fail when resolving chat templates from - repo-id mode for this model. Using a local snapshot path avoids that path. - """ - if os.path.isdir(model_name_or_path): - return model_name_or_path - return snapshot_download(repo_id=model_name_or_path) - - def __init__( - self, - model_name_or_path: str = DEFAULT_MODEL_NAME, - ctx_len: int = DEFAULT_CTX_LEN, - num_cores: int = DEFAULT_NUM_CORES, - num_devices: int = DEFAULT_NUM_DEVICES, - mxfp6_matmul: bool = False, - compile_prefill_seq_len: int = None, - ): - """Initialize the AI100-only reranker wrapper. - - This loads: - - HF config/processor for prompt and multimodal preprocessing. - - QEFF dual-QPC model wrapper (vision encoder + language decoder). - - Token ids for "yes"/"no" used to compute reranker scores. - - Parameters - ---------- - model_name_or_path: - HF model id or local snapshot path. - """ - self.model_name_or_path = model_name_or_path - self.model_source = self._resolve_model_source(model_name_or_path) - self.ctx_len = ctx_len - self.num_cores = num_cores - self.num_devices = num_devices - self.mxfp6_matmul = mxfp6_matmul - self.compile_prefill_seq_len = compile_prefill_seq_len - self.max_length = MAX_LENGTH - self.fps = FPS - - # Use local snapshot for stable processor/chat-template loading. 
- config = AutoConfig.from_pretrained(self.model_source, trust_remote_code=True, padding=True) - if hasattr(config, "use_cache"): - config.use_cache = True - if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): - config.text_config.use_cache = True - - self.processor = AutoProcessor.from_pretrained(self.model_source, trust_remote_code=True, padding=True) - self.model = QEFFAutoModelForImageTextToText.from_pretrained( - self.model_source, - kv_offload=True, - trust_remote_code=True, - config=config, - ) - - self.yes_token_id, self.no_token_id = self._get_yes_no_token_ids(self.processor.tokenizer) - self._compiled_qpc_paths = None - self._compiled_prefill_seq_len = 0 - self._compiled_height = None - self._compiled_width = None - - @staticmethod - def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]: - """Resolve tokenizer ids for the exact tokens 'yes' and 'no'.""" - vocab = tokenizer.get_vocab() - if "yes" not in vocab or "no" not in vocab: - raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.") - return vocab["yes"], vocab["no"] - - @staticmethod - def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: - """Convert model logits into a reranker relevance score. - - Score formula: - sigmoid(logit_yes - logit_no) - """ - # Convert runtime output to torch and use final-token logits. - logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() - if logits_tensor.ndim == 3: - logits_tensor = logits_tensor[:, -1, :] - # Binary relevance score from yes/no logit gap. 
- score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) - return float(score[0].item()) - - @staticmethod - def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: - """Truncate while preserving all special tokens in sequence order.""" - if len(tokens) <= max_length: - return tokens - - # Preserve all special/control tokens and trim only non-special tokens. - special_tokens_set = set(special_tokens) - num_special = sum(1 for token in tokens if token in special_tokens_set) - num_non_special_to_keep = max_length - num_special - - final_tokens = [] - non_special_kept_count = 0 - for token in tokens: - if token in special_tokens_set: - final_tokens.append(token) - elif non_special_kept_count < num_non_special_to_keep: - final_tokens.append(token) - non_special_kept_count += 1 - return final_tokens - - def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: - """Build one multimodal content block (prefix + optional image + optional text).""" - # Prefix helps the model distinguish query vs document sections. - content = [{"type": "text", "text": prefix}] - - if not text and not image and not video: - content.append({"type": "text", "text": "NULL"}) - return content - - if video: - raise ValueError("Video input is not supported in this AI100-only example.") - - if image: - # Convert local paths to file:// URIs for the processor. 
- if isinstance(image, str): - image_content = image if image.startswith(("http", "oss")) else "file://" + image - else: - image_content = image - content.append( - { - "type": "image", - "image": image_content, - "min_pixels": MIN_PIXELS, - "max_pixels": MAX_PIXELS, - } - ) - - if text: - content.append({"type": "text", "text": text}) - - return content - - def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: - """Create the chat payload for one query-document pair.""" - # Prompt shape follows the HF reranker reference format. - contents = [{"type": "text", "text": ": " + instruction}] - - contents.extend( - self._format_mm_content( - query.get("text"), - query.get("image"), - query.get("video"), - prefix=":", - ) - ) - contents.extend( - self._format_mm_content( - document.get("text"), - document.get("image"), - document.get("video"), - prefix="\n:", - ) - ) - - return [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": ( - "Judge whether the Document meets the requirements based on the Query and the Instruct " - 'provided. Note that the answer can only be "yes" or "no".' - ), - } - ], - }, - {"role": "user", "content": contents}, - ] - - def _tokenize_pair(self, pair: List[Dict]) -> Dict: - """Tokenize a query-document pair with the exact HF multimodal pipeline.""" - # Processor expects list-of-conversations. - pairs = [pair] - text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) - - # Build image/video tensors + metadata for processor inputs. 
- images, videos, video_kwargs = process_vision_info( - pairs, - image_patch_size=16, - return_video_kwargs=True, - return_video_metadata=True, - ) - - if videos is not None: - videos, video_metadatas = zip(*videos) - videos = list(videos) - video_metadatas = list(video_metadatas) - else: - video_metadatas = None - - inputs = self.processor( - text=text, - images=images, - videos=videos, - video_metadata=video_metadatas, - truncation=False, - padding=False, - do_resize=False, - **video_kwargs, - ) - - # Apply custom truncation preserving trailing template control tokens. - for i, input_ids in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ( - self._truncate_tokens_optimized( - input_ids[:-5], - self.max_length, - self.processor.tokenizer.all_special_ids, - ) - + input_ids[-5:] - ) - - # Re-pad through tokenizer utilities so masks align with token ids. - padded = self.processor.tokenizer.pad( - {"input_ids": inputs["input_ids"]}, - padding=True, - return_tensors="pt", - max_length=self.max_length, - ) - for key in padded: - inputs[key] = padded[key] - - if "pixel_values" in inputs: - # Keep pixels fp32 before explicit cast to fp16 during vision run. - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - return inputs - - def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int = None): - """Prepare model inputs for dual-QPC prefill execution.""" - # True prompt length before compile-aligned padding. - runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1]) - effective_prefill = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len - if effective_prefill < runtime_prompt_len: - raise ValueError( - f"prefill_seq_len ({effective_prefill}) must be >= runtime prompt length ({runtime_prompt_len})." - ) - - # Let model helper compute position_ids and multimodal placement. 
- prepared_inputs = self.model.model.prepare_inputs_for_generation( - inputs=tokenized_inputs, - prefill_seq_len=effective_prefill, - batch_size=1, - ) - - # Normalize image_grid_thw to the shape consumed by compiled path. - if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2: - thw = prepared_inputs["image_grid_thw"][0] - t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item()) - prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype) - - if "pixel_values" in prepared_inputs: - prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32) - - return prepared_inputs, runtime_prompt_len - - def _ensure_compiled(self, prefill_seq_len: int, height: int, width: int): - """Compile QPCs if needed, otherwise reuse cached compiled artifacts.""" - # Reuse previously compiled artifacts whenever shapes are compatible. - if ( - self._compiled_qpc_paths is not None - and prefill_seq_len <= self._compiled_prefill_seq_len - and height == self._compiled_height - and width == self._compiled_width - ): - return - - reuse_vision_qpc = ( - self._compiled_qpc_paths is not None and height == self._compiled_height and width == self._compiled_width - ) - - # Compile one max prefill specialization and optionally skip vision recompile. 
- compiled_paths = self.model.compile( - prefill_seq_len=prefill_seq_len, - ctx_len=self.ctx_len, - img_size=max(height, width), - height=height, - width=width, - num_cores=self.num_cores, - num_devices=self.num_devices, - mxfp6_matmul=self.mxfp6_matmul, - # vision_embed_fp32=True, - skip_vision=reuse_vision_qpc, - ) - if reuse_vision_qpc: - compiled_paths["vision_qpc_path"] = self._compiled_qpc_paths["vision_qpc_path"] - - self._compiled_qpc_paths = compiled_paths - self._compiled_prefill_seq_len = prefill_seq_len - self._compiled_height = height - self._compiled_width = width - - @staticmethod - def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: - """Create zero-valued placeholders matching vision output buffers.""" - return {name: np.zeros_like(value) for name, value in vision_outputs.items()} - - def _run_ai100_vision(self, prepared_inputs) -> Dict[str, np.ndarray]: - """Run the compiled vision encoder QPC and return retained-state buffers.""" - if "pixel_values" not in prepared_inputs or "image_grid_thw" not in prepared_inputs: - raise ValueError("Missing pixel_values/image_grid_thw for vision execution.") - - # Vision session produces retained states consumed by language session. - vision_session = QAICInferenceSession(self._compiled_qpc_paths["vision_qpc_path"]) - vision_outputs = vision_session.run( - { - # Vision qpc expects fp16 pixels + int64 grid coordinates. - "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16), - "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64), - } - ) - vision_session.deactivate() - return vision_outputs - - def _run_ai100_prefill(self, prepared_inputs, vision_template: Dict[str, np.ndarray]) -> np.ndarray: - """Run one prefill pass on AI100 language QPC and return logits.""" - # Match runtime input to compiled prefill length. 
- prefill_len = prepared_inputs["position_ids"].shape[-1] - input_ids = prepared_inputs["input_ids"] - if input_ids.shape[1] < prefill_len: - pad = torch.full( - (input_ids.shape[0], prefill_len - input_ids.shape[1]), - 1, - dtype=input_ids.dtype, - device=input_ids.device, - ) - input_ids = torch.cat([input_ids, pad], dim=1) - else: - input_ids = input_ids[:, :prefill_len] - - position_ids = prepared_inputs["position_ids"][..., :prefill_len] - - # For text-only docs, inject zeroed retained states with matching shapes. - if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_outputs = self._run_ai100_vision(prepared_inputs) - else: - vision_outputs = self._zero_vision_outputs(vision_template) - - # Skip past/retained buffers and run only required prefill inputs. - lang_session = QAICInferenceSession(self._compiled_qpc_paths["lang_qpc_path"]) - lang_session.skip_buffers( - [ - name - for name in lang_session.input_names + lang_session.output_names - if name.startswith("past_") or name.endswith("_RetainedState") - ] - ) - lang_session.set_buffers(vision_outputs) - outputs = lang_session.run( - { - # image_idx selects the vision buffer slot for this request. - "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), - "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), - "image_idx": np.zeros((1, 1), dtype=np.int64), - } - ) - lang_session.deactivate() - return outputs["logits"] - - def process(self, inputs: Dict) -> List[float]: - """Score all documents for one query on AI100. - - High-level flow: - 1) Build model-ready query-document pairs. - 2) Find max prompt/image shape across all docs. - 3) Compile once at max shape (single stable specialization). - 4) Run prefill per doc and convert logits -> score. - """ - # Unpack user payload. 
- instruction = inputs["instruction"] - query = inputs.get("query", {}) - documents = inputs.get("documents", []) - - # Collect per-document tokenized contexts first so we can compile once - # with the largest prompt/image shape required by this request. - prepared_contexts = [] - max_prompt_len = 0 - max_grid_h = 22 - max_grid_w = 34 - - # Build each pair in the exact chat-template format expected by the model. - for document in documents: - pair = self._format_mm_instruction(instruction, query, document) - tokenized = self._tokenize_pair(pair) - runtime_prompt_len = int(tokenized["input_ids"].shape[1]) - - # Track the max image grid (H, W) seen so compile dimensions can - # handle all documents in this batch. - if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: - grid = tokenized["image_grid_thw"] - max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) - max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) - - prepared_contexts.append( - { - "tokenized": tokenized, - "runtime_prompt_len": runtime_prompt_len, - } - ) - max_prompt_len = max(max_prompt_len, runtime_prompt_len) - - # Empty documents list => no scores. - if max_prompt_len == 0: - return [] - - # Convert max grid to compile-time pixel dimensions using model patch size. - patch_size = int(self.model.model.config.vision_config.patch_size) - compile_height = max_grid_h * patch_size - compile_width = max_grid_w * patch_size - - # Compile/reuse a single language specialization and prepare all requests - # to that same prefill length to avoid per-document recompiles. - target_prefill_seq_len = max_prompt_len - if self.compile_prefill_seq_len is not None: - if self.compile_prefill_seq_len < max_prompt_len: - raise ValueError( - f"--compile-prefill-seq-len ({self.compile_prefill_seq_len}) must be >= " - f"max runtime prompt length ({max_prompt_len})." 
- ) - target_prefill_seq_len = self.compile_prefill_seq_len - - self._ensure_compiled(target_prefill_seq_len, compile_height, compile_width) - - # Prepare all documents to the same prefill length used at compile time. - prepared_contexts_with_prefill = [] - vision_template = None - for ctx in prepared_contexts: - prepared_inputs, _ = self._prepare_inputs(ctx["tokenized"], prefill_seq_len=target_prefill_seq_len) - prepared_contexts_with_prefill.append({"prepared_inputs": prepared_inputs}) - - # Capture one real vision-output template so text-only docs can reuse - # zero-valued buffers with exact matching shapes. - if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_template = self._run_ai100_vision(prepared_inputs) - - # This example currently expects at least one image document to establish - # retained-state buffer shapes for mixed image/text batches. - if vision_template is None: - raise ValueError("At least one image document is required to initialize AI100 vision buffers.") - - # Run language prefill and compute scalar score per document. - scores = [] - for ctx in prepared_contexts_with_prefill: - logits = self._run_ai100_prefill( - ctx["prepared_inputs"], - vision_template=vision_template, - ) - # Reranker score = sigmoid(logit_yes - logit_no). - score = self._score_from_logits(logits, self.yes_token_id, self.no_token_id) - scores.append(score) - - return scores - - -def main(): - # Keep CLI simple: just allow model id/path override. 
- parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.") - parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME) - parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.") - parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.") - parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.") - parser.add_argument( - "--mxfp6-matmul", - action="store_true", - help="Enable MXFP6 matmul during compile (default: disabled).", - ) - parser.add_argument( - "--compile-prefill-seq-len", - type=int, - default=None, - help=( - "Optional fixed prefill sequence length for compile/padding. " - "Must be >= max prompt length of the current request." - ), - ) - args = parser.parse_args() - - model = QEffQwen3VLReranker( - model_name_or_path=args.model_name, - ctx_len=args.ctx_len, - num_cores=args.num_cores, - num_devices=args.num_devices, - mxfp6_matmul=args.mxfp6_matmul, - compile_prefill_seq_len=args.compile_prefill_seq_len, - ) - - # Example input payload matching the HF reranker schema. - inputs = { - "instruction": "Retrieve images or text relevant to the user's query.", - "query": {"text": "A woman playing with her dog on a beach at sunset."}, - "documents": [ - { - "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust." 
- }, - {"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, - { - "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", - }, - ], - "fps": 1.0, - } - - # Print one score per document in the same order as inputs["documents"]. - scores = model.process(inputs) - print(scores) - - -if __name__ == "__main__": - main() diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index f4cdb6a0fd..85df559970 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -5,7 +5,7 @@ "model_type": "llava", "batch_size": 1, "prompt_len": 784, - "ctx_len": 2048, + "ctx_len": 1024, "img_size": 336, "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", diff --git a/tests/transformers/models/image_text_to_text/test_reranker_mad.py b/tests/transformers/models/image_text_to_text/test_reranker_mad.py deleted file mode 100644 index 3a6497b520..0000000000 --- a/tests/transformers/models/image_text_to_text/test_reranker_mad.py +++ /dev/null @@ -1,455 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -import json -import os -from typing import Dict, List, Tuple - -import numpy as np -import pytest -import torch -from huggingface_hub import snapshot_download -from qwen_vl_utils import process_vision_info -from transformers import AutoConfig, AutoProcessor - -from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText -from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm - -CONFIG_PATH = "tests/configs/image_text_model_configs.json" - -PT_AI100_MAD_MAX = 5e-3 -MAX_LENGTH = 8192 -RERANKER_DOC_LIMIT = int(os.getenv("QEFF_RERANKER_DOC_LIMIT", "0")) - -IMAGE_BASE_FACTOR = 16 -IMAGE_FACTOR = IMAGE_BASE_FACTOR * 2 -MIN_PIXELS = 4 * IMAGE_FACTOR * IMAGE_FACTOR -MAX_PIXELS = 1280 * IMAGE_FACTOR * IMAGE_FACTOR - -EXAMPLE_INPUTS = { - "instruction": "Retrieve relevant content.", - "query": {"text": "dog on beach"}, - "documents": [ - {"image": "https://picsum.photos/id/237/536/354"}, - {"text": "A dog running on the beach."}, - ], -} - -with open(CONFIG_PATH, "r") as f: - config_data = json.load(f) - reranker_models = config_data["image_text_reranker_models"] - -test_reranker_models = [model_config["model_name"] for model_config in reranker_models] -reranker_model_config_dict = {model["model_name"]: model for model in reranker_models} - - -def _resolve_model_source(model_name_or_path: str) -> str: - if os.path.isdir(model_name_or_path): - return model_name_or_path - return snapshot_download(repo_id=model_name_or_path) - - -def _format_mm_content(text, image, video, prefix: str) -> List[Dict]: - content = [{"type": "text", "text": prefix}] - - if not text and not image and not video: - content.append({"type": "text", "text": "NULL"}) - return content - - if video: - raise ValueError("Video input is not supported in this test.") - - 
if image: - if isinstance(image, str): - image_content = image if image.startswith(("http", "oss")) else "file://" + image - else: - image_content = image - content.append( - { - "type": "image", - "image": image_content, - "min_pixels": MIN_PIXELS, - "max_pixels": MAX_PIXELS, - } - ) - - if text: - content.append({"type": "text", "text": text}) - - return content - - -def _format_mm_instruction(instruction: str, query: Dict, document: Dict) -> List[Dict]: - contents = [{"type": "text", "text": ": " + instruction}] - - contents.extend( - _format_mm_content( - query.get("text"), - query.get("image"), - query.get("video"), - prefix=":", - ) - ) - contents.extend( - _format_mm_content( - document.get("text"), - document.get("image"), - document.get("video"), - prefix="\n:", - ) - ) - - return [ - { - "role": "system", - "content": [ - { - "type": "text", - "text": ( - "Judge whether the Document meets the requirements based on the Query and the Instruct " - 'provided. Note that the answer can only be "yes" or "no".' 
- ), - } - ], - }, - {"role": "user", "content": contents}, - ] - - -def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: - if len(tokens) <= max_length: - return tokens - - special_tokens_set = set(special_tokens) - num_special = sum(1 for token in tokens if token in special_tokens_set) - num_non_special_to_keep = max_length - num_special - - final_tokens = [] - non_special_kept_count = 0 - for token in tokens: - if token in special_tokens_set: - final_tokens.append(token) - elif non_special_kept_count < num_non_special_to_keep: - final_tokens.append(token) - non_special_kept_count += 1 - return final_tokens - - -def _tokenize_pair(processor, pair: List[Dict]) -> Dict: - pairs = [pair] - text = processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) - - images, videos, video_kwargs = process_vision_info( - pairs, - image_patch_size=16, - return_video_kwargs=True, - return_video_metadata=True, - ) - - if videos is not None: - videos, video_metadatas = zip(*videos) - videos = list(videos) - video_metadatas = list(video_metadatas) - else: - video_metadatas = None - - inputs = processor( - text=text, - images=images, - videos=videos, - video_metadata=video_metadatas, - truncation=False, - padding=False, - do_resize=False, - **video_kwargs, - ) - - for i, input_ids in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ( - _truncate_tokens_optimized( - input_ids[:-5], - MAX_LENGTH, - processor.tokenizer.all_special_ids, - ) - + input_ids[-5:] - ) - - padded = processor.tokenizer.pad( - {"input_ids": inputs["input_ids"]}, - padding=True, - return_tensors="pt", - max_length=MAX_LENGTH, - ) - for key in padded: - inputs[key] = padded[key] - - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - return inputs - - -def _get_yes_no_token_ids(tokenizer) -> Tuple[int, int]: - vocab = tokenizer.get_vocab() - if "yes" not in vocab or "no" not in 
vocab: - raise ValueError("Could not resolve tokenizer ids for exact tokens 'yes' and 'no'.") - return vocab["yes"], vocab["no"] - - -def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> np.ndarray: - if isinstance(logits, np.ndarray): - logits_tensor = torch.from_numpy(logits) - else: - logits_tensor = logits.detach().cpu() - - if logits_tensor.ndim == 3: - logits_tensor = logits_tensor[:, -1, :] - elif logits_tensor.ndim != 2: - raise ValueError(f"Unsupported logits rank for score conversion: {logits_tensor.ndim}") - - score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) - return score.detach().cpu().numpy().astype(np.float64) - - -def _score_from_last_hidden(last_hidden_state: torch.Tensor, score_linear: torch.nn.Linear) -> np.ndarray: - score = torch.sigmoid(score_linear(last_hidden_state[:, -1])).squeeze(-1) - return score.detach().cpu().numpy().astype(np.float64) - - -def _make_score_linear(model_hf, yes_token_id: int, no_token_id: int) -> torch.nn.Linear: - lm_head_weights = model_hf.lm_head.weight.data - weight_yes = lm_head_weights[yes_token_id] - weight_no = lm_head_weights[no_token_id] - - linear_layer = torch.nn.Linear(weight_yes.shape[0], 1, bias=False) - with torch.no_grad(): - linear_layer.weight[0] = weight_yes - weight_no - return linear_layer.eval() - - -def _mad_stats(reference: np.ndarray, candidate: np.ndarray) -> Tuple[float, float]: - diff = np.abs(reference - candidate) - return float(np.mean(diff)), float(np.max(diff)) - - -def _prepare_qeff_inputs(qeff_model, tokenized_inputs: Dict, prefill_seq_len: int = None): - runtime_prompt_len = int(tokenized_inputs["input_ids"].shape[1]) - effective_prefill_seq_len = runtime_prompt_len if prefill_seq_len is None else prefill_seq_len - if effective_prefill_seq_len < runtime_prompt_len: - raise ValueError( - f"prefill_seq_len ({effective_prefill_seq_len}) must be >= runtime prompt length ({runtime_prompt_len})." 
- ) - - prepared_inputs = qeff_model.model.prepare_inputs_for_generation( - inputs=tokenized_inputs, - prefill_seq_len=effective_prefill_seq_len, - batch_size=1, - ) - - if "image_grid_thw" in prepared_inputs and prepared_inputs["image_grid_thw"].ndim == 2: - thw = prepared_inputs["image_grid_thw"][0] - t, h, w = int(thw[0].item()), int(thw[1].item()), int(thw[2].item()) - prepared_inputs["image_grid_thw"] = torch.zeros((1, t, h, w), dtype=thw.dtype) - - if "pixel_values" in prepared_inputs: - prepared_inputs["pixel_values"] = prepared_inputs["pixel_values"].to(torch.float32) - - return prepared_inputs, runtime_prompt_len - - -def _zero_vision_outputs(vision_outputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: - return {name: np.zeros_like(value) for name, value in vision_outputs.items()} - - -def _run_ai100_vision(vision_qpc_path: str, prepared_inputs) -> Dict[str, np.ndarray]: - vision_session = QAICInferenceSession(vision_qpc_path) - vision_inputs = { - "pixel_values": prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16), - "image_grid_thw": prepared_inputs["image_grid_thw"].detach().cpu().numpy().astype(np.int64), - } - vision_outputs = vision_session.run(vision_inputs) - vision_session.deactivate() - return vision_outputs - - -def _run_ai100_prefill(qpc_paths, prepared_inputs, vision_template): - if not isinstance(qpc_paths, dict): - raise ValueError("Expected qpc_paths to be a dict with vision/lang QPC keys.") - - vision_qpc_path = qpc_paths.get("vision_qpc_path") - lang_qpc_path = qpc_paths.get("lang_qpc_path") - if vision_qpc_path is None or lang_qpc_path is None: - raise ValueError("Missing vision_qpc_path/lang_qpc_path in compiled QPC outputs.") - - prefill_len = prepared_inputs["position_ids"].shape[-1] - input_ids = prepared_inputs["input_ids"] - if input_ids.shape[1] < prefill_len: - pad = torch.full( - (input_ids.shape[0], prefill_len - input_ids.shape[1]), - 1, - dtype=input_ids.dtype, - device=input_ids.device, - ) - 
input_ids = torch.cat([input_ids, pad], dim=1) - else: - input_ids = input_ids[:, :prefill_len] - position_ids = prepared_inputs["position_ids"][..., :prefill_len] - - if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_outputs = _run_ai100_vision(vision_qpc_path, prepared_inputs) - else: - vision_outputs = _zero_vision_outputs(vision_template) - - lang_session = QAICInferenceSession(lang_qpc_path) - lang_session.skip_buffers( - [ - name - for name in lang_session.input_names + lang_session.output_names - if name.startswith("past_") or name.endswith("_RetainedState") - ] - ) - lang_session.set_buffers(vision_outputs) - lang_inputs = { - "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), - "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), - "image_idx": np.zeros((1, 1), dtype=np.int64), - } - outputs = lang_session.run(lang_inputs) - lang_session.deactivate() - return outputs["logits"] - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.regular -@pytest.mark.parametrize("model_name", test_reranker_models) -def test_qwen3_vl_reranker_mad_parity(model_name): - torch.manual_seed(42) - model_cfg = reranker_model_config_dict[model_name] - model_source = _resolve_model_source(model_name) - - config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) - config = set_num_layers_vlm(config, n_layer=model_cfg["num_layers"]) - if hasattr(config, "use_cache"): - config.use_cache = True - if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): - config.text_config.use_cache = True - - model_hf = load_vlm_model(config) - model_hf.eval() - - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_source, - kv_offload=True, - config=config, - ) - processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) - - yes_token_id, no_token_id = _get_yes_no_token_ids(processor.tokenizer) - score_linear = 
_make_score_linear(model_hf, yes_token_id, no_token_id).to(next(model_hf.parameters()).device) - score_linear = score_linear.to(dtype=next(model_hf.parameters()).dtype) - - doc_contexts = [] - max_prompt_len = 0 - max_grid_h = 22 - max_grid_w = 34 - - hf_scores_list = [] - - documents = EXAMPLE_INPUTS["documents"] - if RERANKER_DOC_LIMIT > 0: - documents = documents[:RERANKER_DOC_LIMIT] - - for document in documents: - pair = _format_mm_instruction( - instruction=EXAMPLE_INPUTS["instruction"], - query=EXAMPLE_INPUTS["query"], - document=document, - ) - tokenized = _tokenize_pair(processor, pair) - runtime_prompt_len = int(tokenized["input_ids"].shape[1]) - - hf_inputs = {} - for key, value in tokenized.items(): - hf_inputs[key] = value.to(next(model_hf.parameters()).device) if torch.is_tensor(value) else value - with torch.no_grad(): - hf_last_hidden = model_hf.model(**hf_inputs).last_hidden_state - hf_score = _score_from_last_hidden(hf_last_hidden, score_linear)[0] - hf_scores_list.append(float(hf_score)) - - if "image_grid_thw" in tokenized and tokenized["image_grid_thw"].numel() > 0: - grid = tokenized["image_grid_thw"] - max_grid_h = max(max_grid_h, int(grid[..., 1].max().item())) - max_grid_w = max(max_grid_w, int(grid[..., 2].max().item())) - - doc_contexts.append( - { - "tokenized": tokenized, - } - ) - max_prompt_len = max(max_prompt_len, runtime_prompt_len) - - patch_size = int(qeff_model.model.config.vision_config.patch_size) - compile_height = max_grid_h * patch_size - compile_width = max_grid_w * patch_size - - qpc_paths = qeff_model.compile( - img_size=max(compile_height, compile_width), - height=compile_height, - width=compile_width, - prefill_seq_len=max_prompt_len, - ctx_len=model_cfg["ctx_len"], - num_devices=1, - num_cores=16, - mxfp6_matmul=False, - ) - - ai100_scores_list = [] - - prepared_contexts = [] - vision_template_ai100 = None - for context in doc_contexts: - prepared_inputs, _ = _prepare_qeff_inputs( - qeff_model=qeff_model, - 
tokenized_inputs=context["tokenized"], - prefill_seq_len=max_prompt_len, - ) - prepared_contexts.append( - { - "prepared_inputs": prepared_inputs, - } - ) - if vision_template_ai100 is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_template_ai100 = _run_ai100_vision(qpc_paths["vision_qpc_path"], prepared_inputs) - - if vision_template_ai100 is None: - raise ValueError("Expected at least one image document to initialize vision templates.") - - for context in prepared_contexts: - prepared_inputs_runtime = context["prepared_inputs"] - ai100_logits = _run_ai100_prefill( - qpc_paths=qpc_paths, - prepared_inputs=prepared_inputs_runtime, - vision_template=vision_template_ai100, - ) - ai100_score = _score_from_logits(ai100_logits, yes_token_id, no_token_id)[0] - ai100_scores_list.append(float(ai100_score)) - - hf_scores = np.array(hf_scores_list, dtype=np.float64) - ai100_scores = np.array(ai100_scores_list, dtype=np.float64) - - print(f"[SCORES] PyTorch(original): {hf_scores.tolist()}") - print(f"[SCORES] AI100: {ai100_scores.tolist()}") - - pt_ai100_mad_mean, pt_ai100_mad_max = _mad_stats(hf_scores, ai100_scores) - print(f"[MAD] PyTorch(original) vs AI100: mean={pt_ai100_mad_mean:.6e}, max={pt_ai100_mad_max:.6e}") - assert pt_ai100_mad_max <= PT_AI100_MAD_MAX, ( - f"PyTorch(original) vs AI100 MAD max {pt_ai100_mad_max:.6e} " - f"exceeds threshold {PT_AI100_MAD_MAX:.6e}. " - f"Check tokenizer ids, prompt formatting, runtime prompt length slicing, and compile dimensions." 
- ) From 85070cb616879063a6086c3a498cdcdb44358d18 Mon Sep 17 00:00:00 2001 From: Amit Date: Wed, 20 May 2026 23:32:03 +0530 Subject: [PATCH 03/15] Addressed comments and fix CI issue Signed-off-by: Amit Signed-off-by: Amit Raj --- examples/reranker/qwen3vl/README.md | 7 ++----- examples/reranker/qwen3vl/reranker_model.py | 10 +++------- scripts/Jenkinsfile | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md index d9d96645a8..7ebe1d7db8 100644 --- a/examples/reranker/qwen3vl/README.md +++ b/examples/reranker/qwen3vl/README.md @@ -23,11 +23,8 @@ pip install "qwen-vl-utils>=0.0.14" ## Scripts -- `qwen3_vl_reranker.py` - runnable example that explicitly shows: - - `QEFFAutoModelForImageTextToText.from_pretrained(...)` - - `model.compile(...)` arguments for QPC generation - - AI100 scoring call flow -- `reranker_model.py` - Qwen3-VL-specific helper logic (prompting/tokenization/scoring/runtime glue) adapted from the official Qwen reranker reference: +- `qwen3_vl_reranker.py` - simple runnable API usage example. +- `reranker_model.py` - AI100 dual-QPC implementation adapted from the official Qwen reranker reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py ## Run diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 33e73b05f6..8577c8a979 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -5,17 +5,13 @@ # # ---------------------------------------------------------------------------- -"""Qwen3-VL-specific reranker helpers for AI100 runtime. +"""Core AI100 reranker implementation for Qwen3-VL reranker models. 
The tokenization/scoring flow is adapted from the official Qwen reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py -This module intentionally keeps only Qwen3-VL-specific reranker logic -(prompt construction, multimodal tokenization, yes/no score computation, -and AI100 runtime orchestration with compiled QPC paths). - -Model loading (`from_pretrained`) and model compilation (`compile`) are exposed -in `qwen3_vl_reranker.py` so users can directly see QEff API usage. +This module isolates AI100 dual-QPC runtime details so the user-facing example +script (`qwen3_vl_reranker.py`) remains focused on simple API usage. """ from typing import Dict, List, Tuple diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index f437a1521a..8f40bf247c 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -64,7 +64,7 @@ pipeline { pip install .[test] && pip install junitparser pytest-xdist && pip install librosa==0.10.2 soundfile==0.13.1 && - pip install qwen-vl-utils==0.0.14 && + pip install "qwen-vl-utils>=0.0.14" && pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 && pip install onnx_ir rm -rf QEfficient" From 6b015cbeeb2ae0efa2524100775659aa015020b1 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 21 May 2026 11:00:53 +0530 Subject: [PATCH 04/15] Updated installation of qwen-vl-utils Signed-off-by: Amit Raj --- QEfficient/transformers/models/modeling_auto.py | 8 ++------ .../transformers/models/whisper/modeling_whisper.py | 5 ++++- scripts/Jenkinsfile | 5 ++--- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index af7cce655b..65b89d274f 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1389,7 +1389,7 @@ def export( kv_offload=True, 
continuous_batching=self.continuous_batching, comp_ctx_lengths=self.comp_ctx_lengths_decode, - prefill_seq_len=prefill_seq_len, + **dummy_inputs_kwargs, ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, @@ -1397,11 +1397,7 @@ def export( comp_ctx_lengths=self.comp_ctx_lengths_decode, ) except TypeError: - inputs = self.model.get_dummy_inputs( - kv_offload=True, - comp_ctx_lengths=self.comp_ctx_lengths_decode, - prefill_seq_len=prefill_seq_len, - ) + inputs = self.model.get_dummy_inputs(kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode ) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 1bdcd07ada..bf01a1779f 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -795,7 +795,10 @@ def get_dummy_inputs( **kwargs, ): bs = 1 - seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN)) + seq_len = kwargs.get("prefill_seq_len") + if seq_len is None: + seq_len = 32 + seq_len = int(seq_len) encoder_seq_len = self.config.max_source_positions encoder_feature_count = self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 8f40bf247c..49f637c2f9 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -64,9 +64,8 @@ pipeline { pip install .[test] && pip install junitparser pytest-xdist && pip install librosa==0.10.2 soundfile==0.13.1 && - pip install "qwen-vl-utils>=0.0.14" && - pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 && - pip install onnx_ir + pip install qwen-vl-utils==0.0.14 && + pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 
torchvision==0.22.0+cpu einops==0.8.1 rm -rf QEfficient" ''' } From ef46e41fac3f5a99950c508e8eebfa6bb28bde29 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 22 May 2026 17:58:35 +0530 Subject: [PATCH 05/15] Addressed comments Signed-off-by: Amit Raj --- examples/reranker/qwen3vl/README.md | 7 +- .../reranker/qwen3vl/qwen3_vl_reranker.py | 4 +- examples/reranker/qwen3vl/reranker_model.py | 153 ++++++++++++++---- 3 files changed, 126 insertions(+), 38 deletions(-) diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md index 7ebe1d7db8..d9d96645a8 100644 --- a/examples/reranker/qwen3vl/README.md +++ b/examples/reranker/qwen3vl/README.md @@ -23,8 +23,11 @@ pip install "qwen-vl-utils>=0.0.14" ## Scripts -- `qwen3_vl_reranker.py` - simple runnable API usage example. -- `reranker_model.py` - AI100 dual-QPC implementation adapted from the official Qwen reranker reference: +- `qwen3_vl_reranker.py` - runnable example that explicitly shows: + - `QEFFAutoModelForImageTextToText.from_pretrained(...)` + - `model.compile(...)` arguments for QPC generation + - AI100 scoring call flow +- `reranker_model.py` - Qwen3-VL-specific helper logic (prompting/tokenization/scoring/runtime glue) adapted from the official Qwen reranker reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py ## Run diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 01884d0d08..42e2cf5082 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -85,13 +85,13 @@ def main() -> None: model_source = resolve_model_source(args.model_name) # 1) Load config + processor + QEff model through public QEff/HF APIs. 
- config = AutoConfig.from_pretrained(model_source, trust_remote_code=True) + config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) if hasattr(config, "use_cache"): config.use_cache = True if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): config.text_config.use_cache = True - processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True) + processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) model = QEFFAutoModelForImageTextToText.from_pretrained( model_source, kv_offload=True, diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 8577c8a979..8cd8a5ed4f 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -5,32 +5,27 @@ # # ---------------------------------------------------------------------------- -"""Core AI100 reranker implementation for Qwen3-VL reranker models. +"""Qwen3-VL-specific reranker helpers for AI100 runtime. The tokenization/scoring flow is adapted from the official Qwen reference: https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B/blob/main/scripts/qwen3_vl_reranker.py -This module isolates AI100 dual-QPC runtime details so the user-facing example -script (`qwen3_vl_reranker.py`) remains focused on simple API usage. +This module intentionally keeps only Qwen3-VL-specific reranker logic +(prompt construction, multimodal tokenization, yes/no score computation, +and AI100 runtime orchestration with compiled QPC paths). + +Model loading (`from_pretrained`) and model compilation (`compile`) are exposed +in `qwen3_vl_reranker.py` so users can directly see QEff API usage. 
""" from typing import Dict, List, Tuple import numpy as np import torch +from huggingface_hub import snapshot_download +from qwen_vl_utils import process_vision_info from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( - format_mm_content, - format_mm_instruction, - get_yes_no_token_ids, - score_from_logits, - tokenize_pair, - truncate_tokens_optimized, -) -from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( - resolve_model_source as _resolve_model_source, -) # Max token budget used by this example's manual truncation/padding flow. MAX_LENGTH = 8192 @@ -48,7 +43,9 @@ def resolve_model_source(model_name_or_path: str) -> str: Some transformers versions can fail when resolving chat templates from repo-id mode for this model. Using a local snapshot path avoids that path. """ - return _resolve_model_source(model_name_or_path) + if os.path.isdir(model_name_or_path): + return model_name_or_path + return snapshot_download(repo_id=model_name_or_path) class QEffQwen3VLReranker: @@ -84,40 +81,128 @@ def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: Score formula: sigmoid(logit_yes - logit_no) """ - score = score_from_logits(logits, yes_token_id, no_token_id) + logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() + if logits_tensor.ndim == 3: + logits_tensor = logits_tensor[:, -1, :] + score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) return float(score[0].item()) @staticmethod def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: """Truncate while preserving all special tokens in sequence order.""" - return truncate_tokens_optimized(tokens, max_length, special_tokens) + if len(tokens) <= max_length: + return tokens + + special_tokens_set = set(special_tokens) + num_special = sum(1 for token in tokens if token 
in special_tokens_set) + num_non_special_to_keep = max_length - num_special + + final_tokens = [] + non_special_kept_count = 0 + for token in tokens: + if token in special_tokens_set: + final_tokens.append(token) + elif non_special_kept_count < num_non_special_to_keep: + final_tokens.append(token) + non_special_kept_count += 1 + return final_tokens def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: """Build one multimodal content block (prefix + optional image + optional text).""" - return format_mm_content( - text=text, - image=image, - video=video, - prefix=prefix, - min_pixels=MIN_PIXELS, - max_pixels=MAX_PIXELS, - unsupported_video_error="Video input is not supported in this AI100-only example.", - ) + content = [{"type": "text", "text": prefix}] + + if not text and not image and not video: + content.append({"type": "text", "text": "NULL"}) + return content + + if video: + raise ValueError("Video input is not supported in this AI100-only example.") + + if image: + if isinstance(image, str): + image_content = image if image.startswith(("http", "oss")) else "file://" + image + else: + image_content = image + content.append( + { + "type": "image", + "image": image_content, + "min_pixels": MIN_PIXELS, + "max_pixels": MAX_PIXELS, + } + ) + + if text: + content.append({"type": "text", "text": text}) + + return content def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: """Create the chat payload for one query-document pair.""" - return format_mm_instruction( - instruction=instruction, - query=query, - document=document, - min_pixels=MIN_PIXELS, - max_pixels=MAX_PIXELS, - unsupported_video_error="Video input is not supported in this AI100-only example.", + contents = [{"type": "text", "text": ": " + instruction}] + + contents.extend( + self._format_mm_content( + query.get("text"), + query.get("image"), + query.get("video"), + prefix=":", + ) ) def _tokenize_pair(self, pair: List[Dict]) -> Dict: 
"""Tokenize a query-document pair with the exact HF multimodal pipeline.""" - return tokenize_pair(self.processor, pair, self.max_length) + pairs = [pair] + text = self.processor.apply_chat_template(pairs, tokenize=False, add_generation_prompt=True) + + images, videos, video_kwargs = process_vision_info( + pairs, + image_patch_size=16, + return_video_kwargs=True, + return_video_metadata=True, + ) + + if videos is not None: + videos, video_metadatas = zip(*videos) + videos = list(videos) + video_metadatas = list(video_metadatas) + else: + video_metadatas = None + + inputs = self.processor( + text=text, + images=images, + videos=videos, + video_metadata=video_metadatas, + truncation=False, + padding=False, + do_resize=False, + **video_kwargs, + ) + + for i, input_ids in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = ( + self._truncate_tokens_optimized( + input_ids[:-5], + self.max_length, + self.processor.tokenizer.all_special_ids, + ) + + input_ids[-5:] + ) + + padded = self.processor.tokenizer.pad( + {"input_ids": inputs["input_ids"]}, + padding=True, + return_tensors="pt", + max_length=self.max_length, + ) + for key in padded: + inputs[key] = padded[key] + + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + return inputs def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int): """Prepare model inputs for dual-QPC prefill execution.""" From 08e7765c9c5d65099625d88941f7bd5578396859 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Mon, 1 Jun 2026 14:25:26 +0530 Subject: [PATCH 06/15] Rebased and addressed comments Signed-off-by: Amit Raj --- .../models/whisper/modeling_whisper.py | 5 +- .../reranker/qwen3vl/qwen3_vl_reranker.py | 4 +- examples/reranker/qwen3vl/reranker_model.py | 143 ++++-------------- 3 files changed, 34 insertions(+), 118 deletions(-) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 
bf01a1779f..1bdcd07ada 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -795,10 +795,7 @@ def get_dummy_inputs( **kwargs, ): bs = 1 - seq_len = kwargs.get("prefill_seq_len") - if seq_len is None: - seq_len = 32 - seq_len = int(seq_len) + seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN)) encoder_seq_len = self.config.max_source_positions encoder_feature_count = self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 42e2cf5082..01884d0d08 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -85,13 +85,13 @@ def main() -> None: model_source = resolve_model_source(args.model_name) # 1) Load config + processor + QEff model through public QEff/HF APIs. - config = AutoConfig.from_pretrained(model_source, trust_remote_code=True, padding=True) + config = AutoConfig.from_pretrained(model_source, trust_remote_code=True) if hasattr(config, "use_cache"): config.use_cache = True if hasattr(config, "text_config") and hasattr(config.text_config, "use_cache"): config.text_config.use_cache = True - processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) + processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True) model = QEFFAutoModelForImageTextToText.from_pretrained( model_source, kv_offload=True, diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 8cd8a5ed4f..33e73b05f6 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -22,10 +22,19 @@ import numpy as np import torch -from huggingface_hub import snapshot_download -from qwen_vl_utils import process_vision_info from 
QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( + format_mm_content, + format_mm_instruction, + get_yes_no_token_ids, + score_from_logits, + tokenize_pair, + truncate_tokens_optimized, +) +from QEfficient.transformers.models.qwen3_vl._reranker_utils import ( + resolve_model_source as _resolve_model_source, +) # Max token budget used by this example's manual truncation/padding flow. MAX_LENGTH = 8192 @@ -43,9 +52,7 @@ def resolve_model_source(model_name_or_path: str) -> str: Some transformers versions can fail when resolving chat templates from repo-id mode for this model. Using a local snapshot path avoids that path. """ - if os.path.isdir(model_name_or_path): - return model_name_or_path - return snapshot_download(repo_id=model_name_or_path) + return _resolve_model_source(model_name_or_path) class QEffQwen3VLReranker: @@ -81,128 +88,40 @@ def _score_from_logits(logits, yes_token_id: int, no_token_id: int) -> float: Score formula: sigmoid(logit_yes - logit_no) """ - logits_tensor = torch.from_numpy(logits) if isinstance(logits, np.ndarray) else logits.detach().cpu() - if logits_tensor.ndim == 3: - logits_tensor = logits_tensor[:, -1, :] - score = torch.sigmoid(logits_tensor[:, yes_token_id] - logits_tensor[:, no_token_id]) + score = score_from_logits(logits, yes_token_id, no_token_id) return float(score[0].item()) @staticmethod def _truncate_tokens_optimized(tokens: List[int], max_length: int, special_tokens: List[int]) -> List[int]: """Truncate while preserving all special tokens in sequence order.""" - if len(tokens) <= max_length: - return tokens - - special_tokens_set = set(special_tokens) - num_special = sum(1 for token in tokens if token in special_tokens_set) - num_non_special_to_keep = max_length - num_special - - final_tokens = [] - non_special_kept_count = 0 - for token in tokens: - if token in special_tokens_set: - final_tokens.append(token) - elif non_special_kept_count 
< num_non_special_to_keep: - final_tokens.append(token) - non_special_kept_count += 1 - return final_tokens + return truncate_tokens_optimized(tokens, max_length, special_tokens) def _format_mm_content(self, text, image, video, prefix: str) -> List[Dict]: """Build one multimodal content block (prefix + optional image + optional text).""" - content = [{"type": "text", "text": prefix}] - - if not text and not image and not video: - content.append({"type": "text", "text": "NULL"}) - return content - - if video: - raise ValueError("Video input is not supported in this AI100-only example.") - - if image: - if isinstance(image, str): - image_content = image if image.startswith(("http", "oss")) else "file://" + image - else: - image_content = image - content.append( - { - "type": "image", - "image": image_content, - "min_pixels": MIN_PIXELS, - "max_pixels": MAX_PIXELS, - } - ) - - if text: - content.append({"type": "text", "text": text}) - - return content + return format_mm_content( + text=text, + image=image, + video=video, + prefix=prefix, + min_pixels=MIN_PIXELS, + max_pixels=MAX_PIXELS, + unsupported_video_error="Video input is not supported in this AI100-only example.", + ) def _format_mm_instruction(self, instruction: str, query: Dict, document: Dict) -> List[Dict]: """Create the chat payload for one query-document pair.""" - contents = [{"type": "text", "text": ": " + instruction}] - - contents.extend( - self._format_mm_content( - query.get("text"), - query.get("image"), - query.get("video"), - prefix=":", - ) + return format_mm_instruction( + instruction=instruction, + query=query, + document=document, + min_pixels=MIN_PIXELS, + max_pixels=MAX_PIXELS, + unsupported_video_error="Video input is not supported in this AI100-only example.", ) def _tokenize_pair(self, pair: List[Dict]) -> Dict: """Tokenize a query-document pair with the exact HF multimodal pipeline.""" - pairs = [pair] - text = self.processor.apply_chat_template(pairs, tokenize=False, 
add_generation_prompt=True) - - images, videos, video_kwargs = process_vision_info( - pairs, - image_patch_size=16, - return_video_kwargs=True, - return_video_metadata=True, - ) - - if videos is not None: - videos, video_metadatas = zip(*videos) - videos = list(videos) - video_metadatas = list(video_metadatas) - else: - video_metadatas = None - - inputs = self.processor( - text=text, - images=images, - videos=videos, - video_metadata=video_metadatas, - truncation=False, - padding=False, - do_resize=False, - **video_kwargs, - ) - - for i, input_ids in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ( - self._truncate_tokens_optimized( - input_ids[:-5], - self.max_length, - self.processor.tokenizer.all_special_ids, - ) - + input_ids[-5:] - ) - - padded = self.processor.tokenizer.pad( - {"input_ids": inputs["input_ids"]}, - padding=True, - return_tensors="pt", - max_length=self.max_length, - ) - for key in padded: - inputs[key] = padded[key] - - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - - return inputs + return tokenize_pair(self.processor, pair, self.max_length) def _prepare_inputs(self, tokenized_inputs: Dict, prefill_seq_len: int): """Prepare model inputs for dual-QPC prefill execution.""" From 31d41a4a4e10bab0529b15a42091dc1c6e210108 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 01:05:16 +0530 Subject: [PATCH 07/15] Intial fix Signed-off-by: Amit Raj --- QEfficient/transformers/models/modeling_auto.py | 9 ++++----- .../models/qwen3_vl/modeling_qwen3_vl.py | 15 +++++---------- examples/reranker/qwen3vl/README.md | 1 - examples/reranker/qwen3vl/qwen3_vl_reranker.py | 2 -- examples/reranker/qwen3vl/reranker_model.py | 5 +++-- 5 files changed, 12 insertions(+), 20 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 65b89d274f..0f64d79175 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ 
b/QEfficient/transformers/models/modeling_auto.py @@ -1379,17 +1379,12 @@ def export( List[str] A list containing the paths to the generated ONNX graph files for both components. """ - dummy_inputs_kwargs = {} - if prefill_seq_len is not None: - dummy_inputs_kwargs["prefill_seq_len"] = int(prefill_seq_len) - # TODO This is a temporary change as continous batching is enabled only for few models. Once support is added for all the models this exception handing can be removed. try: inputs = self.model.get_dummy_inputs( kv_offload=True, continuous_batching=self.continuous_batching, comp_ctx_lengths=self.comp_ctx_lengths_decode, - **dummy_inputs_kwargs, ) dynamic_axes = self.model.get_onnx_dynamic_axes( kv_offload=True, @@ -1733,6 +1728,10 @@ def filter_custom_io_lang(custom_io_lang, onnx_path): elif prefill_seq_len == 1: specializations = specializations["lang"][-1:] qpc_key = "lang_decode_qpc_path" + elif prefill_seq_len is not None and ctx_len is not None and prefill_seq_len == ctx_len: + # Single-shot mode (e.g. reranker): no decode steps, only prefill kernel needed. 
+ specializations = specializations["lang"][:1] + qpc_key = "lang_qpc_path" else: specializations = specializations["lang"] qpc_key = "lang_qpc_path" diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 0f6ab210de..9f609ea2ea 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -847,13 +847,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) - inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) # vision_size = 1024 vision_size = 187 inputs_shapes["vision_embeds"] = ( @@ -865,7 +860,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = (748, 1536) inputs_shapes["image_idx"] = (1, 1) @@ -889,8 +884,8 @@ def get_dummy_inputs( ) lang_inputs["position_ids"] = ( ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -908,7 +903,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in 
range(self.model.config.text_config.num_hidden_layers)] diff --git a/examples/reranker/qwen3vl/README.md b/examples/reranker/qwen3vl/README.md index d9d96645a8..74bc9d4a2a 100644 --- a/examples/reranker/qwen3vl/README.md +++ b/examples/reranker/qwen3vl/README.md @@ -49,7 +49,6 @@ With compile parameters: ```bash python examples/reranker/qwen3vl/qwen3_vl_reranker.py \ --model-name Qwen/Qwen3-VL-Reranker-2B \ - --ctx-len 2048 \ --num-cores 16 \ --num-devices 1 \ --compile-prefill-seq-len 4096 \ diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 01884d0d08..504280e7d9 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -30,7 +30,6 @@ def parse_args() -> argparse.Namespace: """Parse command-line arguments for AI100 compile/inference knobs.""" parser = argparse.ArgumentParser(description="Qwen3-VL reranker example.") parser.add_argument("--model-name", type=str, default="Qwen/Qwen3-VL-Reranker-2B") - parser.add_argument("--ctx-len", type=int, default=2048, help="Context length used at compile time.") parser.add_argument("--num-cores", type=int, default=16, help="Number of AI100 cores.") parser.add_argument("--num-devices", type=int, default=1, help="Number of AI100 devices.") parser.add_argument( @@ -106,7 +105,6 @@ def main() -> None: # 3) Derive compile requirements from current payload. 
compile_specs = reranker.get_compile_specs( inputs=inputs, - ctx_len=args.ctx_len, prefill_seq_len=args.compile_prefill_seq_len, ) diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 33e73b05f6..32c4e65eaa 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -173,7 +173,7 @@ def _collect_contexts(self, inputs: Dict): return prepared_contexts, max_prompt_len, max_grid_h, max_grid_w - def get_compile_specs(self, inputs: Dict, ctx_len: int, prefill_seq_len: int = None) -> Dict[str, int]: + def get_compile_specs(self, inputs: Dict, prefill_seq_len: int = None) -> Dict[str, int]: """Return compile parameters required for this input batch.""" _, max_prompt_len, max_grid_h, max_grid_w = self._collect_contexts(inputs) if max_prompt_len == 0: @@ -189,9 +189,10 @@ def get_compile_specs(self, inputs: Dict, ctx_len: int, prefill_seq_len: int = N height = max_grid_h * patch_size width = max_grid_w * patch_size + # ctx_len == prefill_seq_len always: reranker is single-shot prefill, no decode steps. 
return { "prefill_seq_len": target_prefill_seq_len, - "ctx_len": int(ctx_len), + "ctx_len": target_prefill_seq_len, "img_size": max(height, width), "height": height, "width": width, From 78f91926a1b4ffa025682ddcc88dd0782964c259 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 14:38:11 +0530 Subject: [PATCH 08/15] Update the example script and modelling files Signed-off-by: Amit Raj --- .../models/gemma3/modeling_gemma3.py | 20 ++++--------- .../models/internvl/modeling_internvl.py | 20 ++++--------- .../models/llama4/modeling_llama4.py | 20 ++++--------- .../models/llava/modeling_llava.py | 8 ++--- .../models/llava_next/modeling_llava_next.py | 10 +++---- .../models/mistral3/modeling_mistral3.py | 14 ++++----- .../models/mllama/modeling_mllama.py | 7 ++--- .../models/molmo/modeling_molmo.py | 14 ++++----- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 14 ++++----- .../qwen3_vl_moe/modeling_qwen3_vl_moe.py | 14 ++++----- .../models/whisper/modeling_whisper.py | 5 ++-- tests/configs/image_text_model_configs.json | 30 ------------------- tests/configs/reranker_model_configs.json | 28 +++++++++++++++++ .../models/reranker/test_reranker_mad.py | 7 ++--- .../reranker/test_reranker_models_unit.py | 9 +++--- 15 files changed, 83 insertions(+), 137 deletions(-) create mode 100644 tests/configs/reranker_model_configs.json diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index a3e9257a73..35d9c07cf8 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -969,16 +969,8 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len, dtype=None): return past_key_values def get_dummy_inputs( - self, - comp_ctx_lengths: Optional[List[int]] = None, - kv_offload: bool = False, - continuous_batching: bool = False, - **kwargs, + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False,
continuous_batching: bool = False ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: @@ -987,7 +979,7 @@ def get_dummy_inputs( mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( 1, # constants.INTERN_NUM_PATCHES, mm_tokens_per_image, # constants.INTERN_FEATURE_SIZE, @@ -995,7 +987,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -1012,8 +1004,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -1025,7 +1017,7 @@ def get_dummy_inputs( lang_inputs["past_key_values"] = self.get_dummy_pkv_cache( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) if comp_ctx_lengths is not None: diff --git 
a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 563c42e256..821381ac0d 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -273,16 +273,8 @@ def get_output_names(self, kv_offload: bool = False): return output_names def get_dummy_inputs( - self, - comp_ctx_lengths: Optional[List[int]] = None, - kv_offload: bool = False, - continuous_batching: bool = False, - **kwargs, + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: @@ -301,7 +293,7 @@ def get_dummy_inputs( # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( 1, computed_feature_size * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -309,7 +301,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.INTERN_NUM_PATCHES * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -329,8 +321,8 @@ def get_dummy_inputs( (inputs_shapes["vision_embeds"]), dtype=self.config.vision_config.torch_dtype ) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, 
constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64) @@ -342,7 +334,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 2cf5dbb2e9..7f90262bec 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -1185,16 +1185,8 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): return past_key_values def get_dummy_inputs( - self, - comp_ctx_lengths: Optional[List[int]] = None, - kv_offload: bool = False, - continuous_batching: bool = False, - **kwargs, + self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 336) else: @@ -1202,7 +1194,7 @@ def get_dummy_inputs( # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) max_num_tiles = 17 downsample_ratio = int(round(1.0 / (self.config.vision_config.pixel_shuffle_ratio**2))) num_features_per_tile = int( @@ -1218,7 +1210,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( 
constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( max_num_tiles, # constants.INTERN_NUM_PATCHES, @@ -1234,8 +1226,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -1247,7 +1239,7 @@ def get_dummy_inputs( past_key_values = self.get_dummy_pkv_cache( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 88bb5e1027..3fdfd11b9e 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -168,10 +168,6 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = SEQ_LEN - prefill_seq_len = int(prefill_seq_len) num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -186,11 +182,11 @@ def get_dummy_inputs( "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), 
dtype=self.config.torch_dtype), } lang_inputs = { - "input_ids": torch.ones((BS, prefill_seq_len), dtype=torch.int64), + "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64), "vision_embeds": torch.ones( (BS, vision_size, self.model.language_model.config.hidden_size), dtype=self.config.torch_dtype ), - "attention_mask": torch.ones((BS, prefill_seq_len), dtype=torch.int64), + "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), "image_idx": torch.zeros((1, 1), dtype=torch.int64), } lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1) diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 342269ce50..c2a9137006 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -195,10 +195,6 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.GRANITEVISION_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -225,9 +221,11 @@ def get_dummy_inputs( ), } lang_inputs = { - "input_ids": torch.ones((constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64), + "input_ids": torch.ones( + (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64 + ), "attention_mask": torch.ones( - (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64 + (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64 ), "vision_embeds": torch.ones( ( diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py 
b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 628d1dee2c..9c37353328 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -346,12 +346,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) height = self.config.vision_config.image_size width = self.config.vision_config.image_size patch_size = self.config.vision_config.patch_size @@ -367,7 +363,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -384,8 +380,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -397,7 +393,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) 
lang_inputs["past_key_values"] = [[] for _ in range(self.model.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 45649662a7..d9310c02e4 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -924,12 +924,9 @@ def forward( logits = self.lm_head(hidden_states).float() return logits, image_idx, outputs.past_key_values, pixel_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - seq_len = kwargs.get("prefill_seq_len") - if seq_len is None: - seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - SEQ_LEN = int(seq_len) + SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN CTX_LEN = constants.ONNX_EXPORT_CTX_LEN txt_cfg = self.config.get_text_config() diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index d59ca4e017..3eefba47f5 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -931,13 +931,9 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} inputs_shapes_lang = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) inputs_shapes["vision_embeds"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -946,7 +942,7 @@ def get_dummy_inputs( ) 
inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -980,8 +976,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -993,7 +989,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.n_layers)] diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 357c4af16e..dd70a31c95 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -831,12 +831,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) 
vision_size = 3577 inputs_shapes["vision_embeds"] = ( @@ -848,7 +844,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = (14308, 1176) inputs_shapes["image_idx"] = (1, 1) @@ -862,8 +858,8 @@ def get_dummy_inputs( lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -878,7 +874,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 4a6259bf8d..379c31b52b 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -934,12 +934,8 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): - prefill_seq_len = kwargs.get("prefill_seq_len") - if prefill_seq_len is None: - prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) # vision_size = 1024 vision_size = 187 
inputs_shapes["vision_embeds"] = ( @@ -951,7 +947,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - prefill_seq_len, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) inputs_shapes["pixel_values"] = (748, 1536) inputs_shapes["image_idx"] = (1, 1) @@ -975,8 +971,8 @@ def get_dummy_inputs( ) lang_inputs["position_ids"] = ( ( - torch.arange(prefill_seq_len, dtype=torch.int64) - .view(1, prefill_seq_len) + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -994,7 +990,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=prefill_seq_len, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 1bdcd07ada..89c52c9517 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -30,7 +30,7 @@ from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils._utils import IOInfo -from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE, ONNX_EXPORT_EXAMPLE_SEQ_LEN +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding): @@ -792,10 +792,9 @@ def forward( def get_dummy_inputs( self, - **kwargs, ): bs = 1 - seq_len = int(kwargs.get("prefill_seq_len", ONNX_EXPORT_EXAMPLE_SEQ_LEN)) + seq_len = 32 encoder_seq_len = self.config.max_source_positions encoder_feature_count = 
self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index 85df559970..8181faf430 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -724,36 +724,6 @@ } } ], - "image_text_reranker_models": [ - { - "model_name": "Qwen/Qwen3-VL-Reranker-2B", - "model_type": "qwen3_vl", - "batch_size": 1, - "prompt_len": 128, - "ctx_len": 1024, - "img_size": 1540, - "img_url": "https://picsum.photos/id/237/536/354", - "instruction": "Retrieve candidates relevant to the query.", - "query_text": "A woman playing with her dog on a beach at sunset.", - "document_text": "A woman and her dog spend time together on a beach during sunset.", - "num_layers": 1, - "additional_params": {} - }, - { - "model_name": "Qwen/Qwen3-VL-Reranker-8B", - "model_type": "qwen3_vl", - "batch_size": 1, - "prompt_len": 128, - "ctx_len": 1024, - "img_size": 1540, - "img_url": "https://picsum.photos/id/237/536/354", - "instruction": "Retrieve candidates relevant to the query.", - "query_text": "A woman playing with her dog on a beach at sunset.", - "document_text": "A woman and her dog spend time together on a beach during sunset.", - "num_layers": 1, - "additional_params": {} - } - ], "image_text_embedding_models": [ { "model_name": "Qwen/Qwen3-VL-Embedding-8B", diff --git a/tests/configs/reranker_model_configs.json b/tests/configs/reranker_model_configs.json new file mode 100644 index 0000000000..4427b9da0c --- /dev/null +++ b/tests/configs/reranker_model_configs.json @@ -0,0 +1,28 @@ +[ + { + "model_name": "Qwen/Qwen3-VL-Reranker-2B", + "model_type": "qwen3_vl", + "batch_size": 1, + "prompt_len": 128, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "instruction": "Retrieve candidates relevant to the query.", + "query_text": "A woman playing with her dog on a beach at sunset.", + "document_text": 
"A woman and her dog spend time together on a beach during sunset.", + "num_layers": 1, + "additional_params": {} + }, + { + "model_name": "Qwen/Qwen3-VL-Reranker-8B", + "model_type": "qwen3_vl", + "batch_size": 1, + "prompt_len": 128, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "instruction": "Retrieve candidates relevant to the query.", + "query_text": "A woman playing with her dog on a beach at sunset.", + "document_text": "A woman and her dog spend time together on a beach during sunset.", + "num_layers": 1, + "additional_params": {} + } +] diff --git a/tests/transformers/models/reranker/test_reranker_mad.py b/tests/transformers/models/reranker/test_reranker_mad.py index 148935c5a7..4677f96933 100644 --- a/tests/transformers/models/reranker/test_reranker_mad.py +++ b/tests/transformers/models/reranker/test_reranker_mad.py @@ -39,7 +39,7 @@ ) from QEfficient.utils.test_utils import load_vlm_model, set_num_layers_vlm -CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/image_text_model_configs.json") +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json") PT_AI100_MAD_MAX = 5e-3 MAX_LENGTH = 8192 @@ -60,8 +60,7 @@ } with open(CONFIG_PATH, "r") as f: - config_data = json.load(f) - reranker_models = config_data["image_text_reranker_models"] + reranker_models = json.load(f) test_reranker_models = [model_config["model_name"] for model_config in reranker_models] reranker_model_config_dict = {model["model_name"]: model for model in reranker_models} @@ -298,7 +297,7 @@ def test_qwen3_vl_reranker_mad_parity(model_name): height=compile_height, width=compile_width, prefill_seq_len=max_prompt_len, - ctx_len=model_cfg["ctx_len"], + ctx_len=max_prompt_len, num_devices=1, num_cores=16, mxfp6_matmul=False, diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py index f3036502e1..b79a3d29c9 100644 --- 
a/tests/unit_test/models/reranker/test_reranker_models_unit.py +++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py @@ -8,7 +8,7 @@ Generic unit coverage for image-text reranker model entries. This test is intentionally model-list driven: - - Add/remove reranker models only in tests/configs/image_text_model_configs.json + - Add/remove reranker models only in tests/configs/reranker_model_configs.json - The same unit checks run for every configured reranker model """ @@ -22,13 +22,12 @@ from QEfficient.utils.test_utils import set_num_layers_vlm -CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/image_text_model_configs.json") +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json") def _load_reranker_model_configs() -> List[Dict]: with open(CONFIG_PATH, "r", encoding="utf-8") as file: - config_data = json.load(file) - return config_data.get("image_text_reranker_models", []) + return json.load(file) RERANKER_MODEL_CONFIGS = _load_reranker_model_configs() @@ -51,7 +50,7 @@ def _vision_num_layers(config) -> int: def test_reranker_model_list_is_present(): assert RERANKER_MODEL_CONFIGS, ( - "image_text_reranker_models is empty. Add reranker entries in tests/configs/image_text_model_configs.json." + "reranker_model_configs.json is empty. Add reranker entries in tests/configs/reranker_model_configs.json." ) From 8ffe1faa993dcbcb48202a03f3c175e1f1fbfaa3 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 21:24:06 +0530 Subject: [PATCH 09/15] Embedding: single prefill specialization, remove ctx_len from API Mirror of the reranker fix: Qwen3-VL embedding is single-shot prefill (reads last-token hidden state as embedding vector, no decode loop). `get_compile_specs` now returns ctx_len == prefill_seq_len, triggering Solution A in modeling_auto.py to compile only the Prefill kernel. 
Signed-off-by: Amit Raj --- .../transformers/models/qwen3_vl/_embedding_utils.py | 7 +++---- examples/embeddings/qwen3vl/README.md | 1 - examples/embeddings/qwen3vl/qwen3_vl_embedding.py | 3 --- tests/configs/image_text_model_configs.json | 1 - .../models/embedding_models/test_qwen3vl_embedding_mad.py | 1 - .../models/embedding/test_qwen3vl_embedding_unit.py | 4 ++-- 6 files changed, 5 insertions(+), 12 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py index ca0316371d..bce751db9d 100644 --- a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py +++ b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py @@ -257,9 +257,7 @@ def _collect_contexts(self, inputs: List[Dict[str, Any]]): return contexts, max_prompt_len, max_grid_h, max_grid_w - def get_compile_specs( - self, inputs: List[Dict[str, Any]], ctx_len: int, prefill_seq_len: int = None - ) -> Dict[str, int]: + def get_compile_specs(self, inputs: List[Dict[str, Any]], prefill_seq_len: int = None) -> Dict[str, int]: """Compute compile-time spec values for the current input batch.""" _, max_prompt_len, max_grid_h, max_grid_w = self._collect_contexts(inputs) if max_prompt_len == 0: @@ -275,9 +273,10 @@ def get_compile_specs( height = max_grid_h * patch_size width = max_grid_w * patch_size + # ctx_len == prefill_seq_len always: embedding is single-shot prefill, no decode steps. 
return { "prefill_seq_len": target_prefill_seq_len, - "ctx_len": int(ctx_len), + "ctx_len": target_prefill_seq_len, "img_size": max(height, width), "height": height, "width": width, diff --git a/examples/embeddings/qwen3vl/README.md b/examples/embeddings/qwen3vl/README.md index cff14908cc..6f89fade06 100644 --- a/examples/embeddings/qwen3vl/README.md +++ b/examples/embeddings/qwen3vl/README.md @@ -40,7 +40,6 @@ With compile parameters: ```bash python examples/embeddings/qwen3vl/qwen3_vl_embedding.py \ --model-name Qwen/Qwen3-VL-Embedding-8B \ - --ctx-len 2048 \ --num-cores 16 \ --num-devices 1 \ --compile-prefill-seq-len 4096 \ diff --git a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py index bd707ffb08..b3124352a6 100644 --- a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py +++ b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py @@ -24,7 +24,6 @@ from QEfficient.transformers.models.qwen3_vl._embedding_utils import configure_embedding_model_config DEFAULT_MODEL_NAME = "Qwen/Qwen3-VL-Embedding-8B" -DEFAULT_CTX_LEN = 2048 DEFAULT_NUM_CORES = 16 DEFAULT_NUM_DEVICES = 1 DEFAULT_NUM_HIDDEN_LAYERS = 36 @@ -36,7 +35,6 @@ def parse_args() -> argparse.Namespace: """Parse command-line arguments for AI100 compile/inference knobs.""" parser = argparse.ArgumentParser(description="Qwen3-VL embedding example.") parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME) - parser.add_argument("--ctx-len", type=int, default=DEFAULT_CTX_LEN, help="Context length used at compile time.") parser.add_argument("--num-cores", type=int, default=DEFAULT_NUM_CORES, help="Number of AI100 cores.") parser.add_argument("--num-devices", type=int, default=DEFAULT_NUM_DEVICES, help="Number of AI100 devices.") parser.add_argument( @@ -121,7 +119,6 @@ def main() -> None: # 3) Derive compile requirements from current payload. 
compile_specs = embedder.get_compile_specs( inputs=model_inputs, - ctx_len=args.ctx_len, prefill_seq_len=args.compile_prefill_seq_len, ) diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index 8181faf430..d98d0e08a2 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -729,7 +729,6 @@ "model_name": "Qwen/Qwen3-VL-Embedding-8B", "model_type": "qwen3_vl", "batch_size": 1, - "ctx_len": 2048, "num_layers": 1, "vision_depth": 9, "deepstack_index": 8, diff --git a/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py b/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py index d540593b86..885372355d 100644 --- a/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py +++ b/tests/transformers/models/embedding_models/test_qwen3vl_embedding_mad.py @@ -108,7 +108,6 @@ def test_qwen3_vl_embedding_cpu_vs_ai100_mad_parity(model_name): model_inputs = EXAMPLE_QUERIES + EXAMPLE_DOCUMENTS compile_specs = embedder.get_compile_specs( inputs=model_inputs, - ctx_len=model_cfg["ctx_len"], prefill_seq_len=model_cfg.get("compile_prefill_seq_len", None), ) qpc_paths = qeff_model.compile( diff --git a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py index ae7c88e837..a602a0f7dd 100644 --- a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py +++ b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py @@ -118,8 +118,8 @@ def _fake_run_ai100_prefill(prepared_inputs, vision_outputs, lang_qpc_path): monkeypatch.setattr(QEffQwen3VLEmbedder, "_run_ai100_vision", staticmethod(_fake_run_ai100_vision)) monkeypatch.setattr(QEffQwen3VLEmbedder, "_run_ai100_prefill", staticmethod(_fake_run_ai100_prefill)) - compile_specs = embedder.get_compile_specs(inputs=[{}, {}], ctx_len=64, prefill_seq_len=12) - assert compile_specs == 
{"prefill_seq_len": 12, "ctx_len": 64, "img_size": 160, "height": 96, "width": 160} + compile_specs = embedder.get_compile_specs(inputs=[{}, {}], prefill_seq_len=12) + assert compile_specs == {"prefill_seq_len": 12, "ctx_len": 12, "img_size": 160, "height": 96, "width": 160} embeddings = embedder.process( inputs=[{}, {}], From 4baed79cc9bf8745518d4d2c1d55bc81d5cebd9c Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 4 Jun 2026 22:44:01 +0530 Subject: [PATCH 10/15] Address review comments: use ONNX_EXPORT_EXAMPLE_SEQ_LEN constant and simplify config path Signed-off-by: Amit Raj --- QEfficient/transformers/models/whisper/modeling_whisper.py | 4 ++-- tests/unit_test/models/reranker/test_reranker_models_unit.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 89c52c9517..4c30166289 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -30,7 +30,7 @@ from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils._utils import IOInfo -from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE, ONNX_EXPORT_EXAMPLE_SEQ_LEN class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding): @@ -794,7 +794,7 @@ def get_dummy_inputs( self, ): bs = 1 - seq_len = 32 + seq_len = ONNX_EXPORT_EXAMPLE_SEQ_LEN encoder_seq_len = self.config.max_source_positions encoder_feature_count = self.config.num_mel_bins num_key_value_heads = self.config.decoder_attention_heads diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py index b79a3d29c9..7d1321a98b 100644 --- 
a/tests/unit_test/models/reranker/test_reranker_models_unit.py +++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py @@ -14,7 +14,6 @@ import copy import json -import os from typing import Dict, List import pytest @@ -22,7 +21,7 @@ from QEfficient.utils.test_utils import set_num_layers_vlm -CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../../../configs/reranker_model_configs.json") +CONFIG_PATH = "tests/configs/reranker_model_configs.json" def _load_reranker_model_configs() -> List[Dict]: From 65799d21ffda9a76c41c2fd296fdbbf66eac1440 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 5 Jun 2026 08:48:10 +0530 Subject: [PATCH 11/15] Reranker/VLM: fix single-QPC (kv_offload=False) for Qwen3-VL Three bugs in QEffQwen3VLForConditionalGeneration.forward (single-QPC path): 1. self.language_model -> self.model.language_model (attribute error) 2. indices0: wrong batch dim via unsqueeze(0) -> use selected.shape[0] with device 3. get_onnx_dynamic_axes: remove deepstack_features from single-QPC axes (computed internally by vision encoder, not a direct ONNX input) get_specializations kv_offload=False: add grid_height/width/h/w/time to lang specs so qaic-compile can resolve pixel_values dynamic symbols. modeling_auto.py single-QPC compile: apply Solution A (prefill-only spec) and compile without -retained-state for single-shot models to avoid pixel_values / pixel_values_RetainedState shape mismatch. reranker_model.py: add _run_ai100_single_qpc_prefill and update process() to dispatch on isinstance(qpc_paths, dict) for dual vs single QPC. Unit tests: add three tests covering dual/single QPC dispatch. 
Signed-off-by: Amit Raj --- .../transformers/models/modeling_auto.py | 8 +- .../models/qwen3_vl/modeling_qwen3_vl.py | 17 ++- examples/reranker/qwen3vl/reranker_model.py | 102 ++++++++++++---- .../reranker/test_reranker_models_unit.py | 112 ++++++++++++++++++ 4 files changed, 213 insertions(+), 26 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 0f64d79175..3f914eb848 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2425,6 +2425,11 @@ def compile( **compiler_options, ) + # Single-shot mode (reranker/embedding): no decode steps, only prefill kernel needed. + single_shot = prefill_seq_len is not None and ctx_len is not None and prefill_seq_len == ctx_len + if single_shot: + specializations = specializations[:1] + if hasattr(self.model, "get_npi_file") and "node_precision_info" not in compiler_options: compiler_options["node_precision_info"] = self.model.get_npi_file(self.model.name_or_path) @@ -2452,7 +2457,8 @@ def compile( self._compile( onnx_path=onnx_path, compile_dir=compile_dir, - retained_state=True, + # Single-shot (reranker/embedding): no decode, no need for retained-state enforcement. 
+ retained_state=not single_shot, specializations=specializations, convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[target_dtype] == "float16"), mxfp6_matmul=mxfp6_matmul, diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 9f609ea2ea..5a5a8240c2 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -819,12 +819,12 @@ def forward( selected = input_ids == self.model.config.image_token_id indices1 = selected.to(torch.int64).cumsum(1) - 1 indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1) - indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1) + indices0 = torch.arange(selected.shape[0], device=selected.device).view(-1, 1) image_features_expanded = image_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1] # TODO: deepstack_features are not processed for single QPC setup yet. Will do if required. image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) - outputs = self.language_model( + outputs = self.model.language_model( inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, @@ -1108,8 +1108,15 @@ def smart_resize( specializations["lang"] = lang return specializations, compiler_options else: - lang[0].pop("vision_size") - lang[1].pop("vision_size") + # Single QPC: pixel_values and image_grid_thw are direct inputs, + # so the compiler needs the vision spatial symbols in every spec. 
+ for lang_spec in lang: + lang_spec.pop("vision_size") + lang_spec["grid_height"] = grid_height + lang_spec["grid_width"] = grid_width + lang_spec["grid_h"] = grid_h + lang_spec["grid_w"] = grid_w + lang_spec["time"] = time return lang, compiler_options def get_onnx_dynamic_axes( @@ -1153,6 +1160,8 @@ def get_onnx_dynamic_axes( dynamic_axes["lang"] = lang_dynamic_axes else: lang_dynamic_axes.pop("vision_embeds") + # deepstack_features are computed internally by vision encoder in single QPC — not a direct input + vision_dynamic_axes.pop("deepstack_features") dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes} return dynamic_axes diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 32c4e65eaa..77e7be2dc4 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -265,8 +265,58 @@ def _run_ai100_prefill( lang_session.deactivate() return outputs["logits"] - def process(self, inputs: Dict, qpc_paths: Dict[str, str], prefill_seq_len: int) -> List[float]: - """Score all documents for one query on AI100 using precompiled QPCs.""" + @staticmethod + def _run_ai100_single_qpc_prefill(prepared_inputs: Dict, qpc_path: str) -> np.ndarray: + """Run single-QPC (vision+language fused) prefill and return logits.""" + prefill_len = prepared_inputs["position_ids"].shape[-1] + input_ids = prepared_inputs["input_ids"] + if input_ids.shape[1] < prefill_len: + pad = torch.full( + (input_ids.shape[0], prefill_len - input_ids.shape[1]), + 1, + dtype=input_ids.dtype, + device=input_ids.device, + ) + input_ids = torch.cat([input_ids, pad], dim=1) + else: + input_ids = input_ids[:, :prefill_len] + + position_ids = prepared_inputs["position_ids"][..., :prefill_len] + + session = QAICInferenceSession(str(qpc_path)) + + run_inputs = { + "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), + "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), + 
"image_idx": np.zeros((1, 1), dtype=np.int64), + } + + # image_grid_thw is baked as a constant during single-QPC ONNX tracing; only + # pixel_values remains as a dynamic input for the vision encoder. + if "pixel_values" in prepared_inputs: + run_inputs["pixel_values"] = prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16) + else: + # Text-only: pass zeros with the shape fixed at compile time. + pv_idx = session.binding_index_map["pixel_values"] + run_inputs["pixel_values"] = np.zeros(session.bindings[pv_idx].dims, dtype=np.float16) + + # Compiled without -retained-state so the device cannot manage KV cache + # internally — provide explicit zero buffers for the single prefill pass. + for name in session.input_names: + if name.startswith("past_"): + idx = session.binding_index_map[name] + run_inputs[name] = np.zeros(session.bindings[idx].dims, dtype=np.float16) + + outputs = session.run(run_inputs) + session.deactivate() + return outputs["logits"] + + def process(self, inputs: Dict, qpc_paths, prefill_seq_len: int) -> List[float]: + """Score all documents for one query on AI100 using precompiled QPCs. + + Supports both dual-QPC (qpc_paths is a dict with 'vision_qpc_path' and + 'lang_qpc_path') and single-QPC (qpc_paths is a str/Path to the combined QPC). + """ prepared_contexts, max_prompt_len, _, _ = self._collect_contexts(inputs) if max_prompt_len == 0: return [] @@ -277,30 +327,40 @@ def process(self, inputs: Dict, qpc_paths: Dict[str, str], prefill_seq_len: int) f"prefill_seq_len ({target_prefill_seq_len}) must be >= max runtime prompt length ({max_prompt_len})." 
) - if "vision_qpc_path" not in qpc_paths or "lang_qpc_path" not in qpc_paths: - raise ValueError("qpc_paths must contain 'vision_qpc_path' and 'lang_qpc_path'.") - prepared_contexts_with_prefill = [] - vision_template = None for ctx in prepared_contexts: prepared_inputs = self._prepare_inputs(ctx["tokenized"], prefill_seq_len=target_prefill_seq_len) prepared_contexts_with_prefill.append({"prepared_inputs": prepared_inputs}) - if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_template = self._run_ai100_vision(prepared_inputs, vision_qpc_path=qpc_paths["vision_qpc_path"]) - - if vision_template is None: - raise ValueError("At least one image document is required to initialize AI100 vision buffers.") - + is_dual_qpc = isinstance(qpc_paths, dict) scores = [] - for ctx in prepared_contexts_with_prefill: - logits = self._run_ai100_prefill( - ctx["prepared_inputs"], - vision_template=vision_template, - lang_qpc_path=qpc_paths["lang_qpc_path"], - vision_qpc_path=qpc_paths["vision_qpc_path"], - ) - score = self._score_from_logits(logits, self.yes_token_id, self.no_token_id) - scores.append(score) + + if is_dual_qpc: + if "vision_qpc_path" not in qpc_paths or "lang_qpc_path" not in qpc_paths: + raise ValueError("qpc_paths must contain 'vision_qpc_path' and 'lang_qpc_path'.") + + vision_template = None + for ctx in prepared_contexts_with_prefill: + if vision_template is None and "pixel_values" in ctx["prepared_inputs"]: + vision_template = self._run_ai100_vision( + ctx["prepared_inputs"], vision_qpc_path=qpc_paths["vision_qpc_path"] + ) + + if vision_template is None: + raise ValueError("At least one image document is required to initialize AI100 vision buffers.") + + for ctx in prepared_contexts_with_prefill: + logits = self._run_ai100_prefill( + ctx["prepared_inputs"], + vision_template=vision_template, + lang_qpc_path=qpc_paths["lang_qpc_path"], + vision_qpc_path=qpc_paths["vision_qpc_path"], + ) + 
scores.append(self._score_from_logits(logits, self.yes_token_id, self.no_token_id)) + else: + # Single QPC: vision + language fused in one compiled binary. + for ctx in prepared_contexts_with_prefill: + logits = self._run_ai100_single_qpc_prefill(ctx["prepared_inputs"], qpc_path=qpc_paths) + scores.append(self._score_from_logits(logits, self.yes_token_id, self.no_token_id)) return scores diff --git a/tests/unit_test/models/reranker/test_reranker_models_unit.py b/tests/unit_test/models/reranker/test_reranker_models_unit.py index 7d1321a98b..800801f2dd 100644 --- a/tests/unit_test/models/reranker/test_reranker_models_unit.py +++ b/tests/unit_test/models/reranker/test_reranker_models_unit.py @@ -14,9 +14,13 @@ import copy import json +from pathlib import Path from typing import Dict, List +from unittest.mock import MagicMock +import numpy as np import pytest +import torch from transformers import AutoConfig from QEfficient.utils.test_utils import set_num_layers_vlm @@ -79,3 +83,111 @@ def test_reranker_config_reduction_keeps_valid_deepstack(model_cfg: Dict): assert max(deepstack_idxs) < _vision_num_layers(reduced_cfg), ( f"{model_name}: deepstack indexes must be in [0, vision_num_layers)" ) + + +# --------------------------------------------------------------------------- +# Tests: kv_offload=False (single QPC) runtime dispatch in QEffQwen3VLReranker +# --------------------------------------------------------------------------- + + +def _make_dummy_reranker(): + """Build a minimal QEffQwen3VLReranker with mocked internals.""" + # Import the reranker class from the examples directory via importlib + import importlib.util + import sys + + spec = importlib.util.spec_from_file_location( + "reranker_model", + Path(__file__).parents[4] / "examples" / "reranker" / "qwen3vl" / "reranker_model.py", + ) + mod = importlib.util.module_from_spec(spec) + # Stub heavy dependencies so the module loads without hardware + sys.modules.setdefault("QEfficient.generation.cloud_infer", 
MagicMock()) + sys.modules.setdefault("QEfficient.transformers.models.qwen3_vl._reranker_utils", MagicMock()) + spec.loader.exec_module(mod) + return mod.QEffQwen3VLReranker + + +@pytest.fixture() +def reranker_cls(): + return _make_dummy_reranker() + + +def _fake_prepared_inputs(has_image: bool, prefill_len: int = 8): + inputs = { + "input_ids": torch.ones((1, prefill_len), dtype=torch.int64), + "position_ids": torch.arange(prefill_len).reshape(1, 1, prefill_len).expand(4, 1, -1), + } + if has_image: + inputs["pixel_values"] = torch.zeros((748, 1536), dtype=torch.float32) + inputs["image_grid_thw"] = torch.zeros((1, 1, 22, 34), dtype=torch.int64) + return inputs + + +def test_reranker_process_dispatches_to_dual_qpc(reranker_cls, monkeypatch): + """process() with dict qpc_paths uses the dual-QPC path.""" + reranker = object.__new__(reranker_cls) + reranker.yes_token_id = 0 + reranker.no_token_id = 1 + + fake_logits = np.zeros((1, 1, 10), dtype=np.float32) + fake_logits[0, 0, 0] = 2.0 # yes logit > no logit → score > 0.5 + + monkeypatch.setattr(reranker, "_collect_contexts", lambda _: ([{"tokenized": {}}], 4, 22, 34)) + monkeypatch.setattr(reranker, "_prepare_inputs", lambda tok, prefill_seq_len: _fake_prepared_inputs(True)) + monkeypatch.setattr( + reranker_cls, "_run_ai100_vision", staticmethod(lambda pi, vision_qpc_path: {"v": np.zeros((1,))}) + ) + monkeypatch.setattr( + reranker_cls, + "_run_ai100_prefill", + staticmethod(lambda pi, vision_template, lang_qpc_path, vision_qpc_path: fake_logits), + ) + monkeypatch.setattr(reranker_cls, "_score_from_logits", staticmethod(lambda logits, y, n: 0.88)) + + scores = reranker.process( + inputs={}, + qpc_paths={"vision_qpc_path": "v.qpc", "lang_qpc_path": "l.qpc"}, + prefill_seq_len=8, + ) + assert scores == [0.88] + + +def test_reranker_process_dispatches_to_single_qpc(reranker_cls, monkeypatch): + """process() with a non-dict qpc_paths uses the single-QPC path.""" + reranker = object.__new__(reranker_cls) + 
reranker.yes_token_id = 0 + reranker.no_token_id = 1 + + fake_logits = np.zeros((1, 1, 10), dtype=np.float32) + + monkeypatch.setattr(reranker, "_collect_contexts", lambda _: ([{"tokenized": {}}], 4, 22, 34)) + monkeypatch.setattr(reranker, "_prepare_inputs", lambda tok, prefill_seq_len: _fake_prepared_inputs(False)) + monkeypatch.setattr( + reranker_cls, + "_run_ai100_single_qpc_prefill", + staticmethod(lambda pi, qpc_path: fake_logits), + ) + monkeypatch.setattr(reranker_cls, "_score_from_logits", staticmethod(lambda logits, y, n: 0.72)) + + scores = reranker.process(inputs={}, qpc_paths="/path/to/single.qpc", prefill_seq_len=8) + assert scores == [0.72] + + +def test_reranker_process_single_qpc_with_pathlib(reranker_cls, monkeypatch): + """Single QPC path also accepts a pathlib.Path object.""" + reranker = object.__new__(reranker_cls) + reranker.yes_token_id = 0 + reranker.no_token_id = 1 + + monkeypatch.setattr(reranker, "_collect_contexts", lambda _: ([{"tokenized": {}}], 4, 22, 34)) + monkeypatch.setattr(reranker, "_prepare_inputs", lambda tok, prefill_seq_len: _fake_prepared_inputs(True)) + monkeypatch.setattr( + reranker_cls, + "_run_ai100_single_qpc_prefill", + staticmethod(lambda pi, qpc_path: np.zeros((1, 1, 10), dtype=np.float32)), + ) + monkeypatch.setattr(reranker_cls, "_score_from_logits", staticmethod(lambda logits, y, n: 0.5)) + + scores = reranker.process(inputs={}, qpc_paths=Path("/tmp/model.qpc"), prefill_seq_len=8) + assert scores == [0.5] From a9486c8a414050e9f8e42b137a490e5ef3615006 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 5 Jun 2026 10:06:32 +0530 Subject: [PATCH 12/15] Reranker/VLM: fix single-QPC (kv_offload=False) for Qwen3-VL Four bugs in QEffQwen3VLForConditionalGeneration (single-QPC path): 1. self.language_model -> self.model.language_model 2. indices0 wrong batch dim (unsqueeze) -> selected.shape[0] + device 3. get_onnx_dynamic_axes: drop deepstack_features from single-QPC axes 4. 
get_specializations: add grid_height/width/h/w/time for pixel_values modeling_auto.py single-QPC compile: - Solution A (prefill-only spec when prefill_seq_len == ctx_len) - retained_state=False for single-shot to avoid pixel_values shape mismatch reranker_model.py: - _run_ai100_single_qpc_prefill: runs fused session with explicit zero KV buffers (retained_state=False requires host-managed buffers) - process(): dispatch on isinstance(qpc_paths, dict) dual vs single QPC Unit tests: three new tests covering dual/single QPC dispatch Note: KV cache removal from single-shot ONNX is a future optimization requiring KVCacheTransform changes (tracked as TODO in code). Signed-off-by: Amit Raj --- examples/reranker/qwen3vl/qwen3_vl_reranker.py | 2 +- examples/reranker/qwen3vl/reranker_model.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 504280e7d9..8bcf924e95 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -93,7 +93,7 @@ def main() -> None: processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True) model = QEFFAutoModelForImageTextToText.from_pretrained( model_source, - kv_offload=True, + kv_offload=False, trust_remote_code=True, config=config, ) diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 77e7be2dc4..7687f6b6c5 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -302,6 +302,8 @@ def _run_ai100_single_qpc_prefill(prepared_inputs: Dict, qpc_path: str) -> np.nd # Compiled without -retained-state so the device cannot manage KV cache # internally — provide explicit zero buffers for the single prefill pass. 
+ # TODO: KV cache can be eliminated from single-shot ONNX once KVCacheTransform + # is extended to support a no-cache path; tracked as a future optimization. for name in session.input_names: if name.startswith("past_"): idx = session.binding_index_map[name] From 1b784c35c65095727ecf2aab1aad06a690da5539 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 5 Jun 2026 10:46:23 +0530 Subject: [PATCH 13/15] Removed past-key values from onnx and qpcs input output Signed-off-by: Amit Raj --- QEfficient/blocking/attention_blocking.py | 1 + .../transformers/models/modeling_auto.py | 5 +++ .../models/qwen3_vl/modeling_qwen3_vl.py | 45 ++++++++++++++++--- .../embeddings/qwen3vl/qwen3_vl_embedding.py | 2 +- .../reranker/qwen3vl/qwen3_vl_reranker.py | 1 + examples/reranker/qwen3vl/reranker_model.py | 9 ---- 6 files changed, 48 insertions(+), 15 deletions(-) diff --git a/QEfficient/blocking/attention_blocking.py b/QEfficient/blocking/attention_blocking.py index b753420132..cae8840811 100644 --- a/QEfficient/blocking/attention_blocking.py +++ b/QEfficient/blocking/attention_blocking.py @@ -81,6 +81,7 @@ def past_key_value_update( position_ids: Optional[torch.LongTensor] = None, sliding_window: Optional[int] = None, ): + cache_kwargs = {} if past_key_value is not None: cache_kwargs = {"batch_index": batch_index, "position_ids": position_ids} if sliding_window is not None: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3f914eb848..68a125c08d 100755 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2450,6 +2450,11 @@ def compile( CUSTOM_IO_DTYPE_MAP[target_dtype] if "pixel_values" in output_name else kv_cache_dtype ) + # Single-shot mode has no retained state; pixel_values is a direct input so + # its dtype must still be set explicitly (float16 for hardware). 
+ if single_shot: + custom_io["pixel_values"] = CUSTOM_IO_DTYPE_MAP[target_dtype] + # TODO this hould be removed once the continous batching is supported for all the models. compiler_options.pop("continuous_batching", None) compiler_options.pop("kv_cache_batch_size", None) diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 5a5a8240c2..c016646d06 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -57,6 +57,18 @@ def _should_export_embedding_output(module) -> bool: return False +def _is_single_shot_mode(module) -> bool: + """True when model is single-shot prefill only (reranker/embedding) — no KV cache needed.""" + for holder in (module, getattr(module, "model", None)): + if holder is None: + continue + qaic_config = getattr(holder, "qaic_config", None) + if isinstance(qaic_config, dict): + if qaic_config.get("no_kv_cache", False) or qaic_config.get("export_embedding", False): + return True + return False + + def qeff_apply_interleaved_mrope(freqs, mrope_section): """Apply interleaved MRoPE to 3D rotary embeddings. 
Reorganizes frequency layout from chunked [TTT...HHH...WWW] to @@ -549,7 +561,9 @@ def forward( ) -> Union[Tuple, BaseModelOutputWithPast]: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - if self.config.use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = False + effective_use_cache = use_cache if use_cache is not None else self.config.use_cache + if effective_use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) @@ -567,7 +581,11 @@ def forward( elif position_ids.dim() == 2: position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) - target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else (past_seen_tokens if past_seen_tokens > 0 else inputs_embeds.shape[1]) + ) causal_mask = _create_causal_mask( position_ids=position_ids[0], target_length=target_length, sliding_window=None ) @@ -805,7 +823,7 @@ def forward( self, input_ids, position_ids, - past_key_values, + past_key_values=None, pixel_values: Optional[torch.FloatTensor] = None, image_idx: Optional[torch.LongTensor] = None, comp_ctx_lengths: Optional[List[int]] = None, @@ -824,18 +842,24 @@ def forward( # TODO: deepstack_features are not processed for single QPC setup yet. Will do if required. 
image_input_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_input_embeds) + + single_shot = _is_single_shot_mode(self) outputs = self.model.language_model( inputs_embeds=inputs_embeds, position_ids=position_ids, - past_key_values=past_key_values, + past_key_values=None if single_shot else past_key_values, comp_ctx_lengths=comp_ctx_lengths, batch_index=batch_index, - use_cache=True, + use_cache=not single_shot, ) logit_index = position_ids[0].to(torch.int32).argmax(1, keepdim=True) hidden_states = outputs.last_hidden_state[torch.arange(position_ids[0].shape[0]).view(-1, 1), logit_index] logits = self.lm_head(hidden_states) image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) + if single_shot: + if _should_export_embedding_output(self): + return logits, image_embeds, image_idx, hidden_states + return logits, image_embeds, image_idx if _should_export_embedding_output(self): return logits, image_embeds, image_idx, hidden_states, outputs.past_key_values return logits, image_embeds, image_idx, outputs.past_key_values @@ -924,6 +948,8 @@ def get_dummy_inputs( inputs["lang"] = lang_inputs else: lang_inputs.pop("vision_embeds") + if _is_single_shot_mode(self): + lang_inputs.pop("past_key_values") inputs = {**vision_inputs, **lang_inputs} return inputs @@ -1162,6 +1188,10 @@ def get_onnx_dynamic_axes( lang_dynamic_axes.pop("vision_embeds") # deepstack_features are computed internally by vision encoder in single QPC — not a direct input vision_dynamic_axes.pop("deepstack_features") + if _is_single_shot_mode(self): + for i in range(num_layers): + lang_dynamic_axes.pop(f"past_key.{i}", None) + lang_dynamic_axes.pop(f"past_value.{i}", None) dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes} return dynamic_axes @@ -1183,6 +1213,11 @@ def get_output_names(self, kv_offload: bool = False): output_names["vision"] = vision_output_names 
output_names["lang"] = lang_output_names else: + if _is_single_shot_mode(self): + single_shot_outputs = ["logits", "image_idx_output"] + if _should_export_embedding_output(self): + single_shot_outputs.insert(1, "embedding_output") + return single_shot_outputs lang_output_names.insert(1, "pixel_values_RetainedState") lang_output_names.insert(2, "image_idx_output") if _should_export_embedding_output(self): diff --git a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py index b3124352a6..076d3f058f 100644 --- a/examples/embeddings/qwen3vl/qwen3_vl_embedding.py +++ b/examples/embeddings/qwen3vl/qwen3_vl_embedding.py @@ -105,7 +105,7 @@ def main() -> None: processor = AutoProcessor.from_pretrained(model_source, trust_remote_code=True, padding=True) model = QEFFAutoModelForImageTextToText.from_pretrained( model_source, - kv_offload=True, + kv_offload=False, trust_remote_code=True, config=config, qaic_config={"export_embedding": True}, diff --git a/examples/reranker/qwen3vl/qwen3_vl_reranker.py b/examples/reranker/qwen3vl/qwen3_vl_reranker.py index 8bcf924e95..d6e0c5e633 100644 --- a/examples/reranker/qwen3vl/qwen3_vl_reranker.py +++ b/examples/reranker/qwen3vl/qwen3_vl_reranker.py @@ -96,6 +96,7 @@ def main() -> None: kv_offload=False, trust_remote_code=True, config=config, + qaic_config={"no_kv_cache": True}, ) # 2) Build reranker helper and reference payload. 
diff --git a/examples/reranker/qwen3vl/reranker_model.py b/examples/reranker/qwen3vl/reranker_model.py index 7687f6b6c5..b82143dd18 100644 --- a/examples/reranker/qwen3vl/reranker_model.py +++ b/examples/reranker/qwen3vl/reranker_model.py @@ -300,15 +300,6 @@ def _run_ai100_single_qpc_prefill(prepared_inputs: Dict, qpc_path: str) -> np.nd pv_idx = session.binding_index_map["pixel_values"] run_inputs["pixel_values"] = np.zeros(session.bindings[pv_idx].dims, dtype=np.float16) - # Compiled without -retained-state so the device cannot manage KV cache - # internally — provide explicit zero buffers for the single prefill pass. - # TODO: KV cache can be eliminated from single-shot ONNX once KVCacheTransform - # is extended to support a no-cache path; tracked as a future optimization. - for name in session.input_names: - if name.startswith("past_"): - idx = session.binding_index_map[name] - run_inputs[name] = np.zeros(session.bindings[idx].dims, dtype=np.float16) - outputs = session.run(run_inputs) session.deactivate() return outputs["logits"] From d484b027fd7587cf85ef694ff4bd8d981daae889 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 5 Jun 2026 16:03:29 +0530 Subject: [PATCH 14/15] Enabled embedding as well Signed-off-by: Amit Raj --- .../models/qwen3_vl/_embedding_utils.py | 124 ++++++++++++++---- .../models/qwen3_vl/modeling_qwen3_vl.py | 6 +- .../models/qwen3vl/qwen3_vl.py | 6 +- .../embedding/test_qwen3vl_embedding_unit.py | 45 +++++++ 4 files changed, 148 insertions(+), 33 deletions(-) diff --git a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py index bce751db9d..5c99a867a7 100644 --- a/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py +++ b/QEfficient/transformers/models/qwen3_vl/_embedding_utils.py @@ -351,17 +351,71 @@ def _run_ai100_prefill( embedding_output = embedding_output.reshape(embedding_output.shape[0], -1) return embedding_output + @staticmethod + def
_run_ai100_single_qpc_prefill( + prepared_inputs: Dict[str, torch.Tensor], + qpc_path, + ) -> np.ndarray: + """Execute single-QPC (vision+language fused) prefill and return the embedding row.""" + prefill_len = prepared_inputs["position_ids"].shape[-1] + input_ids = prepared_inputs["input_ids"] + if input_ids.shape[1] < prefill_len: + pad = torch.full( + (input_ids.shape[0], prefill_len - input_ids.shape[1]), + 1, + dtype=input_ids.dtype, + device=input_ids.device, + ) + input_ids = torch.cat([input_ids, pad], dim=1) + else: + input_ids = input_ids[:, :prefill_len] + + position_ids = prepared_inputs["position_ids"][..., :prefill_len] + + session = QAICInferenceSession(str(qpc_path)) + + run_inputs = { + "input_ids": input_ids.detach().cpu().numpy().astype(np.int64), + "position_ids": position_ids.detach().cpu().numpy().astype(np.int64), + "image_idx": np.zeros((1, 1), dtype=np.int64), + } + + if "pixel_values" in prepared_inputs: + run_inputs["pixel_values"] = prepared_inputs["pixel_values"].detach().cpu().numpy().astype(np.float16) + else: + pv_idx = session.binding_index_map["pixel_values"] + run_inputs["pixel_values"] = np.zeros(session.bindings[pv_idx].dims, dtype=np.float16) + + for name in session.input_names: + if name.startswith("past_"): + idx = session.binding_index_map[name] + run_inputs[name] = np.zeros(session.bindings[idx].dims, dtype=np.float16) + + outputs = session.run(run_inputs) + session.deactivate() + + if "embedding_output" not in outputs: + raise KeyError( + "Missing 'embedding_output' in single-QPC outputs. Ensure export_embedding is enabled in qaic_config." 
+ ) + + embedding_output = outputs["embedding_output"] + if embedding_output.ndim > 2: + embedding_output = embedding_output.reshape(embedding_output.shape[0], -1) + return embedding_output + def process( self, inputs: List[Dict[str, Any]], - qpc_paths: Dict[str, str], + qpc_paths, prefill_seq_len: int, normalize: bool = True, ) -> torch.Tensor: - """Run AI100 embedding generation for all inputs and return stacked rows.""" - if "vision_qpc_path" not in qpc_paths or "lang_qpc_path" not in qpc_paths: - raise ValueError("qpc_paths must contain 'vision_qpc_path' and 'lang_qpc_path'.") + """Run AI100 embedding generation for all inputs and return stacked rows. + Supports both dual-QPC (qpc_paths is a dict with 'vision_qpc_path' and + 'lang_qpc_path') and single-QPC (qpc_paths is a str/Path to the combined QPC). + """ contexts, max_prompt_len, _, _ = self._collect_contexts(inputs) if max_prompt_len == 0: return torch.empty((0, 0), dtype=torch.float32) @@ -373,7 +427,6 @@ def process( ) prepared_contexts = [] - vision_template = None for ctx in contexts: prepared_inputs, _ = self._prepare_qeff_inputs( qeff_model=self.model, @@ -382,32 +435,47 @@ def process( ) prepared_contexts.append({"prepared_inputs": prepared_inputs}) - if vision_template is None and "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_template = self._run_ai100_vision( - vision_qpc_path=qpc_paths["vision_qpc_path"], - prepared_inputs=prepared_inputs, - ) - - if vision_template is None: - raise ValueError("At least one input with an image is required to initialize AI100 vision buffers.") - + is_dual_qpc = isinstance(qpc_paths, dict) embedding_rows = [] - for ctx in prepared_contexts: - prepared_inputs = ctx["prepared_inputs"] - if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: - vision_outputs = self._run_ai100_vision( - vision_qpc_path=qpc_paths["vision_qpc_path"], + + if is_dual_qpc: + if "vision_qpc_path" not in qpc_paths or "lang_qpc_path" 
not in qpc_paths: + raise ValueError("qpc_paths must contain 'vision_qpc_path' and 'lang_qpc_path'.") + + vision_template = None + for ctx in prepared_contexts: + if vision_template is None and "pixel_values" in ctx["prepared_inputs"]: + vision_template = self._run_ai100_vision( + vision_qpc_path=qpc_paths["vision_qpc_path"], + prepared_inputs=ctx["prepared_inputs"], + ) + + if vision_template is None: + raise ValueError("At least one input with an image is required to initialize AI100 vision buffers.") + + for ctx in prepared_contexts: + prepared_inputs = ctx["prepared_inputs"] + if "pixel_values" in prepared_inputs and "image_grid_thw" in prepared_inputs: + vision_outputs = self._run_ai100_vision( + vision_qpc_path=qpc_paths["vision_qpc_path"], + prepared_inputs=prepared_inputs, + ) + else: + vision_outputs = self._zero_vision_outputs(vision_template) + + embedding_output = self._run_ai100_prefill( prepared_inputs=prepared_inputs, + vision_outputs=vision_outputs, + lang_qpc_path=qpc_paths["lang_qpc_path"], ) - else: - vision_outputs = self._zero_vision_outputs(vision_template) - - embedding_output = self._run_ai100_prefill( - prepared_inputs=prepared_inputs, - vision_outputs=vision_outputs, - lang_qpc_path=qpc_paths["lang_qpc_path"], - ) - embedding_rows.append(torch.from_numpy(embedding_output).to(torch.float32)) + embedding_rows.append(torch.from_numpy(embedding_output).to(torch.float32)) + else: + # Single QPC: vision + language fused in one compiled binary. 
+ for ctx in prepared_contexts: + embedding_output = self._run_ai100_single_qpc_prefill( + prepared_inputs=ctx["prepared_inputs"], qpc_path=qpc_paths + ) + embedding_rows.append(torch.from_numpy(embedding_output).to(torch.float32)) embeddings = torch.cat(embedding_rows, dim=0) if normalize: diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index c016646d06..39faee7547 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -1214,9 +1214,11 @@ def get_output_names(self, kv_offload: bool = False): output_names["lang"] = lang_output_names else: if _is_single_shot_mode(self): - single_shot_outputs = ["logits", "image_idx_output"] + # Single-shot forward returns: (logits, image_embeds, image_idx) + # embedding adds hidden_states: (logits, image_embeds, image_idx, hidden_states) + single_shot_outputs = ["logits", "image_embeds", "image_idx_output"] if _should_export_embedding_output(self): - single_shot_outputs.insert(1, "embedding_output") + single_shot_outputs.append("embedding_output") return single_shot_outputs lang_output_names.insert(1, "pixel_values_RetainedState") lang_output_names.insert(2, "image_idx_output") diff --git a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py index 6b86ea874a..6ba5cfba3f 100644 --- a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py +++ b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py @@ -16,9 +16,9 @@ model_id = "Qwen/Qwen3-VL-32B-Instruct" config = AutoConfig.from_pretrained(model_id) -# config.vision_config.depth = 9 -# config.text_config.num_hidden_layers = 1 -# config.vision_config.deepstack_visual_indexes = [8] +config.vision_config.depth = 9 +config.text_config.num_hidden_layers = 1 +config.vision_config.deepstack_visual_indexes = [8] qeff_model = 
QEFFAutoModelForImageTextToText.from_pretrained( model_id, attn_implementation="eager", kv_offload=True, config=config diff --git a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py index a602a0f7dd..a7e94eac8e 100644 --- a/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py +++ b/tests/unit_test/models/embedding/test_qwen3vl_embedding_unit.py @@ -130,3 +130,48 @@ def _fake_run_ai100_prefill(prepared_inputs, vision_outputs, lang_qpc_path): assert tuple(embeddings.shape) == (2, 4) norms = torch.linalg.norm(embeddings, dim=-1) assert torch.allclose(norms, torch.ones_like(norms), atol=1e-6) + + +@pytest.mark.embedding +def test_qwen3_vl_embedder_single_qpc_dispatch(monkeypatch): + """process() with a non-dict qpc_paths uses the single-QPC path.""" + from pathlib import Path + + embedder = QEffQwen3VLEmbedder(processor=None, model=_DummyQEffModel()) + + contexts = [{"tokenized": {"kind": "image"}}, {"tokenized": {"kind": "text"}}] + + def _fake_collect_contexts(_inputs): + return contexts, 8, 6, 10 + + def _fake_prepare_qeff_inputs(qeff_model, tokenized_inputs, prefill_seq_len): + del qeff_model + prepared = { + "input_ids": torch.arange(8, dtype=torch.int64).unsqueeze(0), + "position_ids": torch.arange(prefill_seq_len, dtype=torch.int64).reshape(1, 1, prefill_seq_len), + } + if tokenized_inputs.get("kind") == "image": + prepared["pixel_values"] = torch.ones((1, 3, 2, 2), dtype=torch.float32) + return prepared, 8 + + def _fake_single_qpc_prefill(prepared_inputs, qpc_path): + del qpc_path + if "pixel_values" in prepared_inputs: + return np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32) + return np.array([[2.0, 1.0, 0.5, 1.0]], dtype=np.float32) + + monkeypatch.setattr(embedder, "_collect_contexts", _fake_collect_contexts) + monkeypatch.setattr(QEffQwen3VLEmbedder, "_prepare_qeff_inputs", staticmethod(_fake_prepare_qeff_inputs)) + monkeypatch.setattr(QEffQwen3VLEmbedder, 
"_run_ai100_single_qpc_prefill", staticmethod(_fake_single_qpc_prefill)) + + # Test with string path + embeddings = embedder.process(inputs=[{}, {}], qpc_paths="/path/to/single.qpc", prefill_seq_len=12, normalize=True) + assert tuple(embeddings.shape) == (2, 4) + norms = torch.linalg.norm(embeddings, dim=-1) + assert torch.allclose(norms, torch.ones_like(norms), atol=1e-6) + + # Test with Path object + embeddings = embedder.process( + inputs=[{}, {}], qpc_paths=Path("/tmp/model.qpc"), prefill_seq_len=12, normalize=False + ) + assert tuple(embeddings.shape) == (2, 4) From b2b0cfdf44045d6ad49fd65e726b16d8d2b346a2 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Sat, 6 Jun 2026 13:23:12 +0530 Subject: [PATCH 15/15] =?UTF-8?q?Restore=20unrelated=20VLM=20model=20files?= =?UTF-8?q?=20to=20base=20=E2=80=94=20keep=20only=20reranker/embedding=20c?= =?UTF-8?q?hanges?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Amit Raj --- .../models/gemma3/modeling_gemma3.py | 20 +++++++++++++------ .../models/internvl/modeling_internvl.py | 20 +++++++++++++------ .../models/llama4/modeling_llama4.py | 20 +++++++++++++------ .../models/llava/modeling_llava.py | 8 ++++++-- .../models/llava_next/modeling_llava_next.py | 10 ++++++---- .../models/mistral3/modeling_mistral3.py | 14 ++++++++----- .../models/mllama/modeling_mllama.py | 7 +++++-- .../models/molmo/modeling_molmo.py | 14 ++++++++----- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 14 ++++++++----- .../qwen3_vl_moe/modeling_qwen3_vl_moe.py | 14 ++++++++----- scripts/Jenkinsfile | 3 ++- 11 files changed, 97 insertions(+), 47 deletions(-) diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index 35d9c07cf8..a3e9257a73 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -969,8 +969,16 @@ def get_dummy_pkv_cache(self, 
config, batch_size, seq_len, dtype=None): return past_key_values def get_dummy_inputs( - self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 896) else: @@ -979,7 +987,7 @@ def get_dummy_inputs( mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) inputs_shapes["vision_embeds"] = ( 1, # constants.INTERN_NUM_PATCHES, mm_tokens_per_image, # constants.INTERN_FEATURE_SIZE, @@ -987,7 +995,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -1004,8 +1012,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -1017,7 +1025,7 
@@ def get_dummy_inputs( lang_inputs["past_key_values"] = self.get_dummy_pkv_cache( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) if comp_ctx_lengths is not None: diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 821381ac0d..563c42e256 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -273,8 +273,16 @@ def get_output_names(self, kv_offload: bool = False): return output_names def get_dummy_inputs( - self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", constants.INTERN_IMG_SIZE) else: @@ -293,7 +301,7 @@ def get_dummy_inputs( # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) inputs_shapes["vision_embeds"] = ( 1, computed_feature_size * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -301,7 +309,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = ( constants.INTERN_NUM_PATCHES * constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -321,8 +329,8 @@ def get_dummy_inputs( 
(inputs_shapes["vision_embeds"]), dtype=self.config.vision_config.torch_dtype ) lang_inputs["position_ids"] = ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((1, 1), dtype=torch.int64) @@ -334,7 +342,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 7f90262bec..2cf5dbb2e9 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -1185,8 +1185,16 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): return past_key_values def get_dummy_inputs( - self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, continuous_batching: bool = False + self, + comp_ctx_lengths: Optional[List[int]] = None, + kv_offload: bool = False, + continuous_batching: bool = False, + **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 336) else: @@ -1194,7 +1202,7 @@ def get_dummy_inputs( # Define shapes inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = 
(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) max_num_tiles = 17 downsample_ratio = int(round(1.0 / (self.config.vision_config.pixel_shuffle_ratio**2))) num_features_per_tile = int( @@ -1210,7 +1218,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = ( max_num_tiles, # constants.INTERN_NUM_PATCHES, @@ -1226,8 +1234,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -1239,7 +1247,7 @@ def get_dummy_inputs( past_key_values = self.get_dummy_pkv_cache( config=self.language_model.config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 3fdfd11b9e..88bb5e1027 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -168,6 +168,10 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = SEQ_LEN + prefill_seq_len = int(prefill_seq_len) num_layers = 
self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -182,11 +186,11 @@ def get_dummy_inputs( "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=self.config.torch_dtype), } lang_inputs = { - "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64), + "input_ids": torch.ones((BS, prefill_seq_len), dtype=torch.int64), "vision_embeds": torch.ones( (BS, vision_size, self.model.language_model.config.hidden_size), dtype=self.config.torch_dtype ), - "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), + "attention_mask": torch.ones((BS, prefill_seq_len), dtype=torch.int64), "image_idx": torch.zeros((1, 1), dtype=torch.int64), } lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1) diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index c2a9137006..342269ce50 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -195,6 +195,10 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = constants.GRANITEVISION_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads @@ -221,11 +225,9 @@ def get_dummy_inputs( ), } lang_inputs = { - "input_ids": torch.ones( - (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64 - ), + "input_ids": torch.ones((constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64), 
"attention_mask": torch.ones( - (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.GRANITEVISION_SEQ_LEN), dtype=torch.int64 + (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len), dtype=torch.int64 ), "vision_embeds": torch.ones( ( diff --git a/QEfficient/transformers/models/mistral3/modeling_mistral3.py b/QEfficient/transformers/models/mistral3/modeling_mistral3.py index 9c37353328..628d1dee2c 100644 --- a/QEfficient/transformers/models/mistral3/modeling_mistral3.py +++ b/QEfficient/transformers/models/mistral3/modeling_mistral3.py @@ -346,8 +346,12 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) height = self.config.vision_config.image_size width = self.config.vision_config.image_size patch_size = self.config.vision_config.patch_size @@ -363,7 +367,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -380,8 +384,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = 
torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -393,7 +397,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.language_model.config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index d9310c02e4..45649662a7 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -924,9 +924,12 @@ def forward( logits = self.lm_head(hidden_states).float() return logits, image_idx, outputs.past_key_values, pixel_values - def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False): + def get_dummy_inputs(self, comp_ctx_lengths: Optional[List[int]] = None, kv_offload: bool = False, **kwargs): BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + seq_len = kwargs.get("prefill_seq_len") + if seq_len is None: + seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + SEQ_LEN = int(seq_len) CTX_LEN = constants.ONNX_EXPORT_CTX_LEN txt_cfg = self.config.get_text_config() diff --git a/QEfficient/transformers/models/molmo/modeling_molmo.py b/QEfficient/transformers/models/molmo/modeling_molmo.py index 3eefba47f5..d59ca4e017 100644 --- a/QEfficient/transformers/models/molmo/modeling_molmo.py +++ b/QEfficient/transformers/models/molmo/modeling_molmo.py @@ -931,9 +931,13 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} inputs_shapes_lang = {} - 
inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) inputs_shapes["vision_embeds"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -942,7 +946,7 @@ def get_dummy_inputs( ) inputs_shapes["position_ids"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = ( constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, @@ -976,8 +980,8 @@ def get_dummy_inputs( lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) @@ -989,7 +993,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.n_layers)] diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index dd70a31c95..357c4af16e 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -831,8 +831,12 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len", constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + if prefill_seq_len is 
None: + prefill_seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) vision_size = 3577 inputs_shapes["vision_embeds"] = ( @@ -844,7 +848,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = (14308, 1176) inputs_shapes["image_idx"] = (1, 1) @@ -858,8 +862,8 @@ def get_dummy_inputs( lang_inputs["vision_embeds"] = torch.zeros((inputs_shapes["vision_embeds"]), dtype=self.config.torch_dtype) lang_inputs["position_ids"] = ( ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -874,7 +878,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index 379c31b52b..4a6259bf8d 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -934,8 +934,12 @@ def get_dummy_inputs( continuous_batching: bool = False, **kwargs, ): + prefill_seq_len = kwargs.get("prefill_seq_len") + if prefill_seq_len is None: + prefill_seq_len = 
constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + prefill_seq_len = int(prefill_seq_len) inputs_shapes = {} - inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, prefill_seq_len) # vision_size = 1024 vision_size = 187 inputs_shapes["vision_embeds"] = ( @@ -947,7 +951,7 @@ def get_dummy_inputs( inputs_shapes["position_ids"] = ( 3, constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, - constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + prefill_seq_len, ) inputs_shapes["pixel_values"] = (748, 1536) inputs_shapes["image_idx"] = (1, 1) @@ -971,8 +975,8 @@ def get_dummy_inputs( ) lang_inputs["position_ids"] = ( ( - torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) - .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + torch.arange(prefill_seq_len, dtype=torch.int64) + .view(1, prefill_seq_len) .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) .unsqueeze(0) @@ -990,7 +994,7 @@ def get_dummy_inputs( kv_cache_shape = get_padding_shape_from_config( config=self.model.config.text_config, batch_size=fbs if continuous_batching else bs, - seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + seq_len=prefill_seq_len, ) lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)] diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 49f637c2f9..f437a1521a 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -65,7 +65,8 @@ pipeline { pip install junitparser pytest-xdist && pip install librosa==0.10.2 soundfile==0.13.1 && pip install qwen-vl-utils==0.0.14 && - pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 + pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.22.0+cpu einops==0.8.1 && + pip install onnx_ir rm -rf QEfficient" ''' }