diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py index bb0c649f34..a0faf9e4d7 100644 --- a/QEfficient/generation/vlm_generation.py +++ b/QEfficient/generation/vlm_generation.py @@ -23,6 +23,7 @@ from typing import Any, Dict, List, Optional, Union import numpy as np +import torch from transformers import AutoImageProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession @@ -33,6 +34,7 @@ QEffTextGenerationBase, TextGeneration, calculate_latency, + get_compilation_dims, write_io_files, ) from QEfficient.utils import LRUCache @@ -467,7 +469,15 @@ def _prepare_vision_language_prompt(self, text_prompt, image_path): return text_prompt def generate( - self, images: List[str], prompts: List[str], generation_len: Optional[int] = None, stream: bool = True, **kwargs + self, + images: List[str], + prompts: List[str], + inputs: torch.Tensor = None, + num_frames: Optional[int] = None, + multi_specs: Optional[bool] = None, + generation_len: Optional[int] = None, + stream: bool = True, + **kwargs, ) -> CloudAI100ExecInfo: """ Main generation method maintaining API compatibility with VisionLanguageGeneration @@ -485,6 +495,9 @@ def generate( Raises: ValueError: If images and prompts lengths don't match """ + if num_frames or multi_specs: + return self._generate_multi_frame_specialization(inputs, num_frames, generation_len) + if len(images) != len(prompts): raise ValueError(f"Number of images ({len(images)}) must match number of prompts ({len(prompts)})") @@ -504,6 +517,228 @@ def generate( # Regular batching mode return self._generate_regular_batching(vision_prompts, generation_len, stream, **kwargs) + def run_prefill_multi_frame_specialization( + self, inputs: Optional[torch.Tensor], num_frames: Optional[int] = 1, generation_len: int = None + ): + """ + Run prefill for multi-frame specialization. 
This is a special case where we have a fixed number of frames + and we want to prefill the model with the input frames before generating the output. + + Args: + inputs: Mapping of model inputs; indexed here by "input_ids", + "attention_mask" and "position_ids" (not a bare tensor) + num_frames: Number of frames to process + generation_len: Length of generation + + Returns: + Tuple of (prefill outputs, decode position_ids, generation_len) + """ + + if not self._qpc_path: + raise TypeError("Please run compile API for language model first!") + + self._session.deactivate() + self._vision_session.activate() + + if not num_frames: + logger.warning("num_frames not specified, defaulting to 1") + num_frames = 1 + + batch_size, ctx_len, fbs = get_compilation_dims(self._qpc_path) + + pad_token_id = 1 + + # Skip inputs/outputs + self._session.skip_buffers( + [ + x + for x in self._session.input_names + self._session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] + ) + + prefill_seq_len = max( + [x[self._session.binding_index_map["input_ids"]][1][1] for x in self._session.allowed_shapes] + + [self._session.bindings[self._session.binding_index_map["input_ids"]].dims[1]] + ) + + input_len = inputs["attention_mask"].sum(1, keepdims=True) + input_ids_length = inputs["input_ids"].shape[1] + num_chunks = -(input_ids_length // -prefill_seq_len) # ceil divide without float + padded_len = num_chunks * prefill_seq_len # Round up to a multiple of prefill_seq_len + + if generation_len is None: + generation_len = ctx_len - input_len.max() + assert generation_len > 0, "generation length should be greater than zero" + + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_length), + "constant", + pad_token_id, + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 + ) + + for k, v in inputs.items(): + inputs[k] = np.array(v) + + vision_inputs = { + k: v + for k, v in inputs.items() + if k + in {"pixel_values", "image_masks", "image_input_idx", 
"valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"} + } + + vision_inputs_fp16 = {"pixel_values", "image_masks"} + vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + + vision_outputs = {} + if vision_inputs: + vision_size = vision_inputs["pixel_values"].shape[0] // num_frames + + pixel_values_shape = list(vision_inputs["pixel_values"][:vision_size].shape) + + idx = next( + i for i, inner in enumerate(self._vision_session.allowed_shapes) if (2, pixel_values_shape) in inner + ) + + buffer_set = { + "vision_embeds": np.zeros( + self._vision_session.allowed_shapes[idx][self._vision_session.binding_index_map["vision_embeds"]][ + 1 + ], + dtype=np.float16, + ), + "image_grid_thw": np.zeros( + self._vision_session.allowed_shapes[idx][self._vision_session.binding_index_map["image_grid_thw"]][ + 1 + ], + dtype=np.int64, + ), + } + if "deepstack_features" in self._vision_session.binding_index_map: + buffer_set["deepstack_features"] = np.zeros( + self._vision_session.allowed_shapes[idx][ + self._vision_session.binding_index_map["deepstack_features"] + ][1], + dtype=np.float16, + ) + + self._vision_session.set_buffers(buffer_set) + + chunk_inputs = vision_inputs.copy() + + for i in range(num_frames): + chunk_inputs["pixel_values"] = vision_inputs["pixel_values"][i * vision_size : (i + 1) * vision_size] + chunk_outputs = self._vision_session.run(chunk_inputs) + if i == 0: + vision_outputs = chunk_outputs + else: + vision_outputs["vision_embeds"] = np.concatenate( + (vision_outputs["vision_embeds"], chunk_outputs["vision_embeds"]), axis=1 + ) + + vision_outputs["vision_embeds"] = np.pad( + vision_outputs["vision_embeds"], + pad_width=( + (0, 0), + (0, self._session.allowed_shapes[0][1][1][1] - vision_outputs["vision_embeds"].shape[-2]), + (0, 0), + ), # pad axis=1 only + mode="constant", + constant_values=0, + ) + if "deepstack_features" in vision_outputs: + vision_outputs["deepstack_features"] = np.pad( + 
+ vision_outputs["deepstack_features"], + pad_width=( + (0, 0), + (0, 0), + (0, self._session.allowed_shapes[0][1][1][1] - vision_outputs["deepstack_features"].shape[-2]), + (0, 0), + ), # pad axis=2 only + mode="constant", + constant_values=0, + ) + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + lang_inputs.pop("attention_mask") + + if self._vision_qpc_path: + self._vision_session.deactivate() + + self._session.activate() + + self._session.set_buffers(vision_outputs) + logger.debug(f"Vision buffers set: {list(vision_outputs.keys())}") + self._vision_processed = True + self._vision_outputs = vision_outputs + + # Calculate generation_len consistent with ctx_len + max_gen_len = self._ctx_len - np.where(lang_inputs["position_ids"] != -1, 1, 0).sum(1, keepdims=True).max() + generation_len = self._fetch_generation_len(generation_len, max_gen_len) + + # Execute chunked prefill + outputs = self._execute_chunked_prefill(lang_inputs, num_chunks) + + self._session.skip_buffers(vision_outputs) + + # Prepare position_ids for decode phase (next position after prefill) + position_ids_decode = np.max(lang_inputs["position_ids"], axis=-1, keepdims=True) + 1 + + return outputs, position_ids_decode, generation_len + + def _generate_multi_frame_specialization( + self, + inputs: Optional[torch.Tensor], + num_frames: Optional[int] = 1, + generation_len: int = None, + stream: List[str] = None, + ): + + exec_batch_size = self.batch_size + max_gen_length = self._ctx_len if not generation_len else max(self._ctx_len, generation_len) + self.initialize_decode_inputs( + num_prompts=1, execution_batch_size=exec_batch_size, max_gen_length=max_gen_length + ) + + if self.is_qwen_vl: + self.decode_pos_ids = np.zeros((4, exec_batch_size, 1), np.int64) + + # Prefill via run_prefill_multi_frame_specialization using the caller-supplied inputs + start = perf_counter() + outputs, position_ids, generation_len_final = self.run_prefill_multi_frame_specialization( + inputs, num_frames, 
generation_len + ) + self.update_decode_input(outputs, position_ids, generation_len_final) + + # Prepare decode + decode_inputs = self.prepare_decode_inputs() + + # Decode loop + loop_start = perf_counter() + num_token = self.run_decode(decode_inputs, generation_len_final, automation=False, streamer=None) + end = perf_counter() + + # Decode generated texts + generated_texts = self.tokenizer.batch_decode(self.generated_ids, skip_special_tokens=True) + + # Latency metrics + total_decode_tokens = num_token + prefill_time, decode_perf, total_perf, total_time = calculate_latency( + total_decode_tokens, loop_start, start, end + ) + perf_metrics = PerfMetrics(prefill_time, decode_perf, total_perf, total_time) + + return CloudAI100ExecInfo( + batch_size=self.batch_size, + generated_texts=generated_texts, + generated_ids=self.generated_ids, + perf_metrics=perf_metrics, + ) + def _generate_regular_batching(self, vision_prompts, generation_len, stream, **kwargs): """Handle regular batching for vision-language generation without creating a second language session""" batch_results = [] diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 222bb4c658..73536b24a1 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1697,6 +1697,8 @@ def generate( generation_len: Optional[int] = None, image_height: Optional[int] = None, image_width: Optional[int] = None, + multi_specs: Optional[bool] = None, + num_frames: Optional[int] = None, **kwargs, ) -> Union[torch.Tensor, np.ndarray]: """ @@ -1745,7 +1747,7 @@ def generate( self._write_io_dir = os.path.join(os.path.dirname(self.onnx_path[1]), "io_dir") if write_io else None # Use VisionLanguageGeneration for image-prompt pairs - if (processor and images) or (tokenizer and prompts): + if (processor and images) or (tokenizer and prompts) or multi_specs or num_frames: # Create VisionLanguageGeneration instance 
batch_size_comp, ctx_len_comp, fbs = get_compilation_dims(self.lang_model.qpc_path) vlm_gen = VisionLanguageGeneration( @@ -1767,6 +1769,9 @@ def generate( # Call generate method return vlm_gen.generate( + inputs=inputs, + num_frames=num_frames, + multi_specs=multi_specs, images=images, prompts=prompts, generation_len=generation_len, diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index dd70a31c95..78f068ab97 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from qwen_vl_utils import smart_resize from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel from transformers.cache_utils import Cache from transformers.modeling_outputs import ( @@ -904,9 +905,9 @@ def get_specializations( prefill_seq_len: int, ctx_len: int, img_size: None, - height: int = None, - width: int = None, - num_frames: int = 1, + height: int | List[int] = None, + width: int | List[int] = None, + num_frames: int | List[int] = 1, kv_offload: bool = False, continuous_batching: bool = False, kv_cache_batch_size: Optional[int] = None, @@ -922,80 +923,69 @@ def get_specializations( logger.warning( f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) + height = [height] if isinstance(height, int) else height + width = [width] if isinstance(width, int) else width + num_frames = [num_frames] * len(height) if isinstance(num_frames, int) else num_frames + prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN channel = 3 patch_size = self.config.vision_config.patch_size temporal_patch_size = self.config.vision_config.temporal_patch_size - IMAGE_FACTOR = 28 - MIN_PIXELS = 4 * 
28 * 28 - MAX_PIXELS = 16384 * 28 * 28 - MAX_RATIO = 200 - - def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - ) -> tuple[int, int]: - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. 
- """ - if max(height, width) / min(height, width) > MAX_RATIO: - raise ValueError( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + IMAGE_FACTOR = constants.IMAGE_FACTOR_QWEN_2_5 + IMAGE_MIN_TOKEN_NUM = constants.IMAGE_MIN_TOKEN_NUM + IMAGE_MAX_TOKEN_NUM = constants.IMAGE_MAX_TOKEN_NUM + min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2 + max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2 + mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None) + if mm_processor_kwargs: + min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels) + max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels) + + vision = [] + max_vision_size = 0 + user_vision_size = compiler_options.pop("vision_size", None) + if user_vision_size: + assert user_vision_size < ctx_len, "vision_size must be less than ctx_len" + max_vision_size = user_vision_size + + for h, w, f in zip(height, width, num_frames): + resized_height, resized_width = smart_resize( + height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels + ) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + grid_height = grid_h * grid_w + grid_width = patch_size * patch_size * temporal_patch_size * channel + vision_size = grid_height // 4 + grid_height = grid_height * batch_size + if not user_vision_size: + max_vision_size = max(max_vision_size, vision_size * f) + assert max_vision_size < ctx_len, ( + f"Computed vision_size of {vision_size * f} tokens " + f"(vision_size={vision_size}, num_frames={f}) for image resolution " + f"(width={w}, height={h}) must be less than ctx_len. Please adjust the image " + "resolution." + ) + else: + assert vision_size * f < user_vision_size, ( + f"Computed vision_size of {vision_size * f} tokens " + f"(vision_size={vision_size}, num_frames={f}) for image resolution " + f"(width={w}, height={h}) cannot exceed the provided " + f"vision_size={user_vision_size}. 
Please adjust the image resolution or " + "increase the vision_size." ) - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - return h_bar, w_bar - - resized_height, resized_width = smart_resize(height=height, width=width) - grid_h, grid_w = resized_height // patch_size, resized_width // patch_size - grid_height = grid_h * grid_w - grid_width = patch_size * patch_size * temporal_patch_size * channel - vision_size = grid_height // 4 - vision_size = vision_size * num_frames - grid_height = grid_height * batch_size - - vision = [ - { - "batch_size": batch_size, - "vision_size": vision_size, - "grid_height": grid_height, - "grid_width": grid_width, - "grid_h": grid_h, - "grid_w": grid_w, - } - ] + vision.append( + { + "batch_size": batch_size, + "vision_size": vision_size, + "grid_height": grid_height, + "grid_width": grid_width, + "grid_h": grid_h, + "grid_w": grid_w, + } + ) if comp_ctx_lengths_prefill is not None: lang = [] @@ -1004,7 +994,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "vision_batch_size": batch_size, } @@ -1023,7 +1013,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "vision_batch_size": batch_size, } @@ -1039,7 +1029,7 @@ def smart_resize( "batch_size": 1 if 
continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "vision_batch_size": batch_size, } @@ -1054,7 +1044,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": 1, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "vision_batch_size": batch_size, } diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 18acc22c53..54b352f73e 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from qwen_vl_utils import smart_resize from transformers.cache_utils import Cache from transformers.modeling_outputs import ( BaseModelOutputWithPast, @@ -908,11 +909,10 @@ def get_specializations( prefill_seq_len: int, ctx_len: int, img_size: None, - height: int = None, - width: int = None, + height: int | List[int] = None, + width: int | List[int] = None, time: int = 1, - # dimensions: List = None, - num_frames: int = 1, + num_frames: int | List[int] = 1, kv_offload: bool = False, continuous_batching: bool = False, kv_cache_batch_size: Optional[int] = None, @@ -927,81 +927,71 @@ def get_specializations( logger.warning( f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) + height = [height] if isinstance(height, int) else height + width = [width] if isinstance(width, int) else width + num_frames = [num_frames] * len(height) if isinstance(num_frames, int) else num_frames prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN channel = 3 patch_size = self.config.vision_config.patch_size temporal_patch_size = 
self.config.vision_config.temporal_patch_size - IMAGE_FACTOR = 32 - MIN_PIXELS = 64 * 32 * 32 - MAX_PIXELS = 16384 * 32 * 32 - MAX_RATIO = 200 - - def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - ) -> tuple[int, int]: - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. 
- """ - if max(height, width) / min(height, width) > MAX_RATIO: - raise ValueError( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + IMAGE_FACTOR = constants.IMAGE_FACTOR_QWEN_3 + IMAGE_MIN_TOKEN_NUM = constants.IMAGE_MIN_TOKEN_NUM + IMAGE_MAX_TOKEN_NUM = constants.IMAGE_MAX_TOKEN_NUM + min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2 + max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2 + mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None) + if mm_processor_kwargs: + min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels) + max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels) + + vision = [] + max_vision_size = 0 + user_vision_size = compiler_options.pop("vision_size", None) + if user_vision_size: + assert user_vision_size < ctx_len, "vision_size must be less than ctx_len" + max_vision_size = user_vision_size + + for h, w, f in zip(height, width, num_frames): + resized_height, resized_width = smart_resize( + height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels + ) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + grid_height = grid_h * grid_w + grid_width = patch_size * patch_size * temporal_patch_size * channel + vision_size = grid_height // 4 + vision_size = vision_size * time + grid_height = grid_height * time * batch_size + if not user_vision_size: + max_vision_size = max(max_vision_size, vision_size * f) + assert max_vision_size < ctx_len, ( + f"Computed vision_size of {vision_size * f} tokens " + f"(vision_size={vision_size}, num_frames={f}) for image resolution " + f"(width={w}, height={h}) must be less than ctx_len. Please adjust the image " + "resolution." 
+ ) + else: + assert vision_size * f < user_vision_size, ( + f"Computed vision_size of {vision_size * f} tokens " + f"(vision_size={vision_size}, num_frames={f}) for image resolution " + f"(width={w}, height={h}) cannot exceed the provided " + f"vision_size={user_vision_size}. Please adjust the image resolution or " + "increase the vision_size." ) - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - return h_bar, w_bar - - resized_height, resized_width = smart_resize(height=height, width=width) - grid_h, grid_w = resized_height // patch_size, resized_width // patch_size - grid_height = grid_h * grid_w - grid_width = patch_size * patch_size * temporal_patch_size * channel - vision_size = grid_height // 4 - vision_size = vision_size * num_frames * time - grid_height = grid_height * time * batch_size - - vision = [ - { - "batch_size": batch_size, - "vision_size": vision_size, - "grid_height": grid_height, - "grid_width": grid_width, - "time": time, - "grid_h": grid_h, - "grid_w": grid_w, - "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), - } - ] + + vision.append( + { + "batch_size": batch_size, + "vision_size": vision_size, + "grid_height": grid_height, + "grid_width": grid_width, + "grid_h": grid_h, + "grid_w": grid_w, + "time": time, + "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), + } + ) if comp_ctx_lengths_prefill is not None: lang = [] @@ -1011,7 +1001,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - 
"vision_size": vision_size, + "vision_size": max_vision_size, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "vision_batch_size": batch_size, "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), @@ -1031,7 +1021,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "vision_batch_size": batch_size, "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), @@ -1048,7 +1038,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "vision_batch_size": batch_size, "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), } @@ -1064,7 +1054,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": 1, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "vision_batch_size": batch_size, "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), } diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index bf02b32ab4..cb83c870e4 100644 --- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from qwen_vl_utils import smart_resize from transformers.cache_utils import Cache from transformers.modeling_outputs import ( BaseModelOutputWithPast, @@ -946,11 +947,10 @@ def get_specializations( prefill_seq_len: int, ctx_len: int, img_size: None, - height: int = None, - width: int = None, + height: int | List[int] = None, + width: 
int | List[int] = None, time: int = 1, - # dimensions: List = None, - num_frames: int = 1, + num_frames: int | List[int] = 1, kv_offload: bool = False, continuous_batching: bool = False, kv_cache_batch_size: Optional[int] = None, @@ -965,81 +965,73 @@ def get_specializations( logger.warning( f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) + + height = [height] if isinstance(height, int) else height + width = [width] if isinstance(width, int) else width + num_frames = [num_frames] * len(height) if isinstance(num_frames, int) else num_frames + prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN channel = 3 patch_size = self.config.vision_config.patch_size temporal_patch_size = self.config.vision_config.temporal_patch_size - IMAGE_FACTOR = 32 - MIN_PIXELS = 64 * 32 * 32 - MAX_PIXELS = 16384 * 32 * 32 - MAX_RATIO = 200 - - def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - ) -> tuple[int, int]: - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. 
The aspect ratio of the image is maintained as closely as possible. - """ - if max(height, width) / min(height, width) > MAX_RATIO: - raise ValueError( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + IMAGE_FACTOR = constants.IMAGE_FACTOR_QWEN_3 + IMAGE_MIN_TOKEN_NUM = constants.IMAGE_MIN_TOKEN_NUM + IMAGE_MAX_TOKEN_NUM = constants.IMAGE_MAX_TOKEN_NUM + min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2 + max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2 + mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None) + if mm_processor_kwargs: + min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels) + max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels) + + vision = [] + max_vision_size = 0 + user_vision_size = compiler_options.pop("vision_size", None) + if user_vision_size: + assert user_vision_size < ctx_len, "vision_size must be less than ctx_len" + max_vision_size = user_vision_size + + for h, w, f in zip(height, width, num_frames): + resized_height, resized_width = smart_resize( + height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels + ) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + grid_height = grid_h * grid_w + grid_width = patch_size * patch_size * temporal_patch_size * channel + vision_size = grid_height // 4 + vision_size = vision_size * time + grid_height = grid_height * time * batch_size + if not user_vision_size: + max_vision_size = max(max_vision_size, vision_size * f) + assert max_vision_size < ctx_len, ( + f"Computed vision_size of {vision_size * f} tokens " + f"(vision_size={vision_size}, num_frames={f}) for image resolution " + f"(width={w}, height={h}) must be less than ctx_len. Please adjust the image " + "resolution." 
) - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - return h_bar, w_bar - - resized_height, resized_width = smart_resize(height=height, width=width) - grid_h, grid_w = resized_height // patch_size, resized_width // patch_size - grid_height = grid_h * grid_w - grid_width = patch_size * patch_size * temporal_patch_size * channel - vision_size = grid_height // 4 - vision_size = vision_size * num_frames * time - grid_height = grid_height * time * batch_size - - vision = [ - { - "batch_size": batch_size, - "vision_size": vision_size, - "grid_height": grid_height, - "grid_width": grid_width, - "time": time, - "grid_h": grid_h, - "grid_w": grid_w, - "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), - } - ] + else: + assert vision_size * f < user_vision_size, ( + f"Computed vision_size of {vision_size * f} tokens " + f"(vision_size={vision_size}, num_frames={f}) for image resolution " + f"(width={w}, height={h}) cannot exceed the provided " + f"vision_size={user_vision_size}. Please adjust the image resolution or " + "increase the vision_size." 
+ ) + + vision.append( + { + "batch_size": batch_size, + "vision_size": vision_size, + "grid_height": grid_height, + "grid_width": grid_width, + "grid_h": grid_h, + "grid_w": grid_w, + "time": time, + "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes), + } + ) if comp_ctx_lengths_prefill is not None: lang = [] diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index f71045834d..36dd35081f 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -184,10 +184,14 @@ def get_default_aic_hw_version() -> str: # Qwen2_5_vl Constants QWEN2_5_VL_HEIGHT = 354 QWEN2_5_VL_WIDTH = 536 +IMAGE_FACTOR_QWEN_2_5 = 28 +IMAGE_MIN_TOKEN_NUM = 4 +IMAGE_MAX_TOKEN_NUM = 16384 # Qwen3_vl Constanst QWEN3_VL_HEIGHT = 354 QWEN3_VL_WIDTH = 536 +IMAGE_FACTOR_QWEN_3 = 32 # Modules to cache while clearing the pytorch weights CACHE_MODULES = ["get_output_names", "get_dummy_inputs", "get_onnx_dynamic_axes", "get_specializations"] diff --git a/examples/image_text_to_text/models/qwen_vl/basic_inference.py b/examples/image_text_to_text/models/qwen2_5_vl/basic_inference.py similarity index 100% rename from examples/image_text_to_text/models/qwen_vl/basic_inference.py rename to examples/image_text_to_text/models/qwen2_5_vl/basic_inference.py diff --git a/examples/image_text_to_text/models/qwen_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml b/examples/image_text_to_text/models/qwen2_5_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml similarity index 100% rename from examples/image_text_to_text/models/qwen_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml rename to examples/image_text_to_text/models/qwen2_5_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml diff --git a/examples/image_text_to_text/models/qwen_vl/continuous_batching.py b/examples/image_text_to_text/models/qwen2_5_vl/continuous_batching.py similarity index 100% rename from examples/image_text_to_text/models/qwen_vl/continuous_batching.py rename to 
examples/image_text_to_text/models/qwen2_5_vl/continuous_batching.py diff --git a/examples/image_text_to_text/models/qwen2_5_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen2_5_vl/multi_specialization_inference.py new file mode 100644 index 0000000000..00d454ff01 --- /dev/null +++ b/examples/image_text_to_text/models/qwen2_5_vl/multi_specialization_inference.py @@ -0,0 +1,147 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import requests +import transformers +from PIL import Image +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + +# For AWQ model update pytorch version to 2.8.* +model_id = "Qwen/Qwen2.5-VL-3B-Instruct" +config = AutoConfig.from_pretrained(model_id) +# config.text_config.num_hidden_layers = 2 + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +# use skip_vision=True, if want to run only text +skip_vision = False + +if skip_vision: # Only Text + batch_size = 1 + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=8, + height=354, + width=536, + mxfp6_matmul=False, + aic_enable_depth_first=True, + skip_vision=True, + mos=1, + ) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Tell me about yourself."}, + ], + }, + ] + + messages = [messages] * batch_size + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + 
return_dict=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + print(output) + +else: # Vision + Text + batch_size = 1 + ctx_len = 8192 + + resolutions = [ + {"width": 360, "height": 120, "num_frames": 3}, + {"width": 320, "height": 180, "num_frames": 2}, + {"width": 360, "height": 240, "num_frames": 1}, + {"width": 454, "height": 256, "num_frames": 1}, + ] + + widths = [s["width"] for s in resolutions] + heights = [s["height"] for s in resolutions] + num_frames = [s["num_frames"] for s in resolutions] + # vision_size = 4096 # vision_size is the maximum visual-token budget that limits the ViT + + # (Vision Transformer) embeddings passed to the language decoder, together + # with the text prompt. Increasing this value preserves more visual detail, + # but consumes more of the model’s context length. + # This argument is **optional**; if not provided, vision_size is automatically + # derived from the input image resolutions to support the largest resolution. 
+ + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=ctx_len, + num_cores=16, + num_devices=2, + height=heights, + width=widths, + num_frames=num_frames, + mm_processor_kwargs={ + "min_pixels": 4 * 28 * 28, + "max_pixels": 16384 * 28 * 28, + }, + # vision_size=vision_size, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) + + image_url = "https://picsum.photos/id/237/536/354" + image = Image.open(requests.get(image_url, stream=True).raw) + image = image.resize((360, 120)) # Resize to any dimension (width, height) present in specializations + frames = 3 + + content = [{"type": "image", "image": image} for _ in range(frames)] + [ + {"type": "text", "text": "Describe the image"} + ] + + messages = [{"role": "user", "content": content}] + + messages = [messages] * batch_size + texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] + + image_inputs, video_inputs = process_vision_info(messages) + inputs = processor( + text=texts, + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + + image_grid_thw = inputs.get("image_grid_thw") + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate( + inputs=inputs, tokenizer=tokenizer, generation_len=100, multi_specs=True, num_frames=frames + ) + print(output.generated_ids) + print(output.generated_texts) + print(output) diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen3_vl_moe/multi_specialization_inference.py new file mode 100644 index 0000000000..c9c1d81383 --- /dev/null +++ b/examples/image_text_to_text/models/qwen3_vl_moe/multi_specialization_inference.py @@ -0,0 +1,150 @@ +# ----------------------------------------------------------------------------- +# +# 
Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import requests +import transformers +from PIL import Image +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + +model_id = "Qwen/Qwen3-VL-30B-A3B-Instruct" +config = AutoConfig.from_pretrained(model_id) + +# For faster execution user can run with lesser layers, For Testing Purpose Only +# config.vision_config.depth = 9 +# config.text_config.num_hidden_layers = 1 +# config.vision_config.deepstack_visual_indexes = [8] + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +# use skip_vision=True, if want to run only text +skip_vision = False + +if skip_vision: # Only Text + batch_size = 1 + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + height=354, + width=536, + mxfp6_matmul=True, + aic_enable_depth_first=True, + skip_vision=True, + mos=1, + use_onnx_subfunctions=True, + ) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Tell me about yourself."}, + ], + }, + ] + + messages = [messages] * batch_size + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + 
print(output) + +else: # Vision + Text + batch_size = 1 + ctx_len = 8192 + + resolutions = [ + {"width": 360, "height": 240, "num_frames": 3}, + {"width": 536, "height": 354, "num_frames": 2}, + {"width": 1024, "height": 1024, "num_frames": 1}, + ] + + widths = [s["width"] for s in resolutions] + heights = [s["height"] for s in resolutions] + num_frames = [s["num_frames"] for s in resolutions] + # vision_size = 4096 # vision_size is the maximum visual-token budget that limits the ViT + + # (Vision Transformer) embeddings passed to the language decoder, together + # with the text prompt. Increasing this value preserves more visual detail, + # but consumes more of the model’s context length. + # This argument is **optional**; if not provided, vision_size is automatically + # derived from the input image resolutions to support the largest resolution. + + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=ctx_len, + num_cores=16, + num_devices=4, + height=heights, + width=widths, + num_frames=num_frames, + mm_processor_kwargs={ + "min_pixels": 4 * 32 * 32, + "max_pixels": 16384 * 32 * 32, + }, + # vision_size=vision_size, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + use_onnx_subfunctions=False, + ) + + image_url = "https://picsum.photos/id/237/536/354" + image = Image.open(requests.get(image_url, stream=True).raw) + image = image.resize((360, 240)) # Resize to any dimension (width, height) present in specializations + frames = 3 + + content = [{"type": "image", "image": image} for _ in range(frames)] + [ + {"type": "text", "text": "Describe the visual"} + ] + + messages = [{"role": "user", "content": content}] + + messages = [messages] * batch_size + texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] + + image_inputs, video_inputs = process_vision_info(messages) + inputs = processor( + text=texts, + images=image_inputs, + videos=video_inputs, + 
padding=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate( + inputs=inputs, tokenizer=tokenizer, generation_len=100, multi_specs=True, num_frames=frames + ) + print(output.generated_ids) + print(output.generated_texts) + print(output) diff --git a/examples/image_text_to_text/models/qwen3vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen3vl/multi_specialization_inference.py new file mode 100644 index 0000000000..7456dce289 --- /dev/null +++ b/examples/image_text_to_text/models/qwen3vl/multi_specialization_inference.py @@ -0,0 +1,150 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import requests +import transformers +from PIL import Image +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + +# For AWQ model update pytorch version to 2.8.* +model_id = "Qwen/Qwen3-VL-2B-Instruct" +config = AutoConfig.from_pretrained(model_id) + +# config.vision_config.depth = 9 +# config.text_config.num_hidden_layers = 1 +# config.vision_config.deepstack_visual_indexes = [8] + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +# use skip_vision=True, if want to run only text +skip_vision = False + +if skip_vision: # Only Text + batch_size = 1 + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=4096, + 
num_cores=16, + num_devices=4, + height=354, + width=536, + mxfp6_matmul=True, + aic_enable_depth_first=True, + skip_vision=True, + mos=1, + use_onnx_subfunctions=True, + ) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Tell me about yourself."}, + ], + }, + ] + + messages = [messages] * batch_size + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + print(output) + +else: # Vision + Text + batch_size = 1 + ctx_len = 8192 + + resolutions = [ + {"width": 360, "height": 240, "num_frames": 3}, + {"width": 536, "height": 354, "num_frames": 2}, + {"width": 1024, "height": 1024, "num_frames": 1}, + ] + + widths = [s["width"] for s in resolutions] + heights = [s["height"] for s in resolutions] + num_frames = [s["num_frames"] for s in resolutions] + # vision_size = 4096 # vision_size is the maximum visual-token budget that limits the ViT + + # (Vision Transformer) embeddings passed to the language decoder, together + # with the text prompt. Increasing this value preserves more visual detail, + # but consumes more of the model’s context length. + # This argument is **optional**; if not provided, vision_size is automatically + # derived from the input image resolutions to support the largest resolution. 
+ + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=ctx_len, + num_cores=16, + num_devices=4, + height=heights, + width=widths, + num_frames=num_frames, + mm_processor_kwargs={ + "min_pixels": 4 * 32 * 32, + "max_pixels": 16384 * 32 * 32, + }, + # vision_size=vision_size, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + use_onnx_subfunctions=False, + ) + + image_url = "https://picsum.photos/id/237/536/354" + image = Image.open(requests.get(image_url, stream=True).raw) + image = image.resize((360, 240)) # Resize to any dimension (width, height) present in specializations + frames = 3 + + content = [{"type": "image", "image": image} for _ in range(frames)] + [ + {"type": "text", "text": "Describe the visual"} + ] + + messages = [{"role": "user", "content": content}] + + messages = [messages] * batch_size + texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] + + image_inputs, video_inputs = process_vision_info(messages) + inputs = processor( + text=texts, + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate( + inputs=inputs, tokenizer=tokenizer, generation_len=100, multi_specs=True, num_frames=frames + ) + print(output.generated_ids) + print(output.generated_texts) + print(output) diff --git a/pyproject.toml b/pyproject.toml index b3c44e22d9..7cfbaf83f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "tensorboard", "fire", "py7zr", + "qwen-vl-utils==0.0.8", "torchmetrics==1.7.0", "ftfy==6.3.1", "imageio==2.37.2",