diff --git a/QEfficient/generation/vlm_generation.py b/QEfficient/generation/vlm_generation.py
index bb0c649f34..a0faf9e4d7 100644
--- a/QEfficient/generation/vlm_generation.py
+++ b/QEfficient/generation/vlm_generation.py
@@ -23,6 +23,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
+import torch
 from transformers import AutoImageProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from QEfficient.generation.cloud_infer import QAICInferenceSession
@@ -33,6 +34,7 @@
     QEffTextGenerationBase,
     TextGeneration,
     calculate_latency,
+    get_compilation_dims,
     write_io_files,
 )
 from QEfficient.utils import LRUCache
@@ -467,7 +469,15 @@ def _prepare_vision_language_prompt(self, text_prompt, image_path):
             return text_prompt
 
     def generate(
-        self, images: List[str], prompts: List[str], generation_len: Optional[int] = None, stream: bool = True, **kwargs
+        self,
+        images: List[str],
+        prompts: List[str],
+        inputs: torch.Tensor = None,
+        num_frames: Optional[int] = None,
+        multi_specs: Optional[bool] = None,
+        generation_len: Optional[int] = None,
+        stream: bool = True,
+        **kwargs,
     ) -> CloudAI100ExecInfo:
         """
         Main generation method maintaining API compatibility with VisionLanguageGeneration
@@ -485,6 +495,9 @@ def generate(
         Raises:
             ValueError: If images and prompts lengths don't match
         """
+        if num_frames or multi_specs:
+            return self._generate_multi_frame_specialization(inputs, num_frames, generation_len)
+
         if len(images) != len(prompts):
             raise ValueError(f"Number of images ({len(images)}) must match number of prompts ({len(prompts)})")
 
@@ -504,6 +517,228 @@ def generate(
             # Regular batching mode
             return self._generate_regular_batching(vision_prompts, generation_len, stream, **kwargs)
 
+    def run_prefill_multi_frame_specialization(
+        self, inputs: Dict[str, Any], num_frames: Optional[int] = 1, generation_len: Optional[int] = None
+    ):
+        """
+        Run the vision encoder once per frame, then run chunked prefill on the language model.
+
+        Args:
+            inputs: Mapping of model inputs keyed by name (``input_ids``, ``attention_mask``, ``position_ids``
+                and vision keys such as ``pixel_values``). It is shallow-copied, so the caller's object is
+                left unmodified.
+            num_frames: Number of frames packed along axis 0 of ``pixel_values``. Falsy values fall back to 1.
+            generation_len: Number of tokens to generate; derived from the compiled context length if None.
+
+        Returns:
+            Tuple ``(outputs, position_ids_decode, generation_len)``: chunked-prefill outputs, position ids
+            for the first decode step, and the resolved generation length.
+
+        Raises:
+            TypeError: If the language model QPC path is not set.
+            ValueError: If no vision specialization matches the per-frame ``pixel_values`` shape.
+        """
+        if not self._qpc_path:
+            raise TypeError("Please run compile API for language model first!")
+
+        self._session.deactivate()
+        self._vision_session.activate()
+
+        if not num_frames:
+            logger.warning("num_frames not specified, defaulting to 1")
+            num_frames = 1
+
+        _, ctx_len, _ = get_compilation_dims(self._qpc_path)
+
+        pad_token_id = 1
+
+        # Skip the past_* inputs and *_RetainedState outputs
+        self._session.skip_buffers(
+            [
+                x
+                for x in self._session.input_names + self._session.output_names
+                if x.startswith("past_") or x.endswith("_RetainedState")
+            ]
+        )
+
+        prefill_seq_len = max(
+            [x[self._session.binding_index_map["input_ids"]][1][1] for x in self._session.allowed_shapes]
+            + [self._session.bindings[self._session.binding_index_map["input_ids"]].dims[1]]
+        )
+
+        # Work on a shallow copy so the padding / numpy conversion below never leaks into the caller's mapping.
+        inputs = dict(inputs)
+
+        input_len = inputs["attention_mask"].sum(1, keepdims=True)
+        input_ids_length = inputs["input_ids"].shape[1]
+        num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
+        padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
+
+        if generation_len is None:
+            generation_len = ctx_len - input_len.max()
+        assert generation_len > 0, "generation length should be greater than zero"
+
+        pad_amount = (0, padded_len - input_ids_length)
+        inputs["input_ids"] = torch.nn.functional.pad(inputs["input_ids"], pad_amount, "constant", pad_token_id)
+        inputs["attention_mask"] = torch.nn.functional.pad(inputs["attention_mask"], pad_amount, "constant", 0)
+
+        inputs = {k: np.array(v) for k, v in inputs.items()}
+
+        vision_inputs = {
+            k: v
+            for k, v in inputs.items()
+            if k
+            in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"}
+        }
+
+        vision_inputs_fp16 = {"pixel_values", "image_masks"}
+        vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})
+
+        vision_outputs = {}
+        if vision_inputs:
+            vision_size = vision_inputs["pixel_values"].shape[0] // num_frames
+            pixel_values_shape = list(vision_inputs["pixel_values"][:vision_size].shape)
+
+            # Pick the vision-session specialization whose allowed shapes contain the per-frame shape.
+            vision_shapes = self._vision_session.allowed_shapes
+            idx = next((i for i, inner in enumerate(vision_shapes) if (2, pixel_values_shape) in inner), None)
+            if idx is None:
+                raise ValueError(f"No vision specialization matches pixel_values shape {pixel_values_shape}")
+
+            binding_index = self._vision_session.binding_index_map
+            spec_shapes = vision_shapes[idx]
+            buffer_set = {
+                "vision_embeds": np.zeros(spec_shapes[binding_index["vision_embeds"]][1], dtype=np.float16),
+                "image_grid_thw": np.zeros(spec_shapes[binding_index["image_grid_thw"]][1], dtype=np.int64),
+            }
+            if "deepstack_features" in binding_index:
+                buffer_set["deepstack_features"] = np.zeros(
+                    spec_shapes[binding_index["deepstack_features"]][1], dtype=np.float16
+                )
+
+            self._vision_session.set_buffers(buffer_set)
+
+            chunk_inputs = vision_inputs.copy()
+
+            for i in range(num_frames):
+                chunk_inputs["pixel_values"] = vision_inputs["pixel_values"][i * vision_size : (i + 1) * vision_size]
+                chunk_outputs = self._vision_session.run(chunk_inputs)
+                if i == 0:
+                    vision_outputs = chunk_outputs
+                    continue
+                vision_outputs["vision_embeds"] = np.concatenate(
+                    (vision_outputs["vision_embeds"], chunk_outputs["vision_embeds"]), axis=1
+                )
+                # Accumulate deepstack features per frame too; otherwise only frame 0 is kept and the padding
+                # below fills every later frame with zeros. Axis 2 is the axis that padding extends.
+                if "deepstack_features" in chunk_outputs:
+                    vision_outputs["deepstack_features"] = np.concatenate(
+                        (vision_outputs["deepstack_features"], chunk_outputs["deepstack_features"]), axis=2
+                    )
+
+            # Length to pad up to, read from the first language-session specialization.
+            # NOTE(review): assumes binding 1 there is the vision input - confirm.
+            target_len = self._session.allowed_shapes[0][1][1][1]
+
+            vision_outputs["vision_embeds"] = np.pad(
+                vision_outputs["vision_embeds"],
+                pad_width=(
+                    (0, 0),
+                    (0, target_len - vision_outputs["vision_embeds"].shape[-2]),
+                    (0, 0),
+                ),  # pad axis=1 only
+                mode="constant",
+                constant_values=0,
+            )
+            if "deepstack_features" in vision_outputs:
+                vision_outputs["deepstack_features"] = np.pad(
+                    vision_outputs["deepstack_features"],
+                    pad_width=(
+                        (0, 0),
+                        (0, 0),
+                        (0, target_len - vision_outputs["deepstack_features"].shape[-2]),
+                        (0, 0),
+                    ),  # pad axis=2 only
+                    mode="constant",
+                    constant_values=0,
+                )
+
+        lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
+        lang_inputs.pop("attention_mask")
+
+        if self._vision_qpc_path:
+            self._vision_session.deactivate()
+
+        self._session.activate()
+
+        self._session.set_buffers(vision_outputs)
+        logger.debug(f"Vision buffers set: {list(vision_outputs.keys())}")
+        self._vision_processed = True
+        self._vision_outputs = vision_outputs
+
+        # Calculate generation_len consistent with ctx_len
+        max_gen_len = self._ctx_len - np.where(lang_inputs["position_ids"] != -1, 1, 0).sum(1, keepdims=True).max()
+        generation_len = self._fetch_generation_len(generation_len, max_gen_len)
+
+        # Execute chunked prefill
+        outputs = self._execute_chunked_prefill(lang_inputs, num_chunks)
+
+        self._session.skip_buffers(vision_outputs)
+
+        # Prepare position_ids for decode phase (next position after prefill)
+        position_ids_decode = np.max(lang_inputs["position_ids"], axis=-1, keepdims=True) + 1
+
+        return outputs, position_ids_decode, generation_len
+
+    def _generate_multi_frame_specialization(
+        self,
+        inputs: Dict[str, Any],
+        num_frames: Optional[int] = 1,
+        generation_len: Optional[int] = None,
+        stream: Optional[bool] = None,
+    ):
+        """
+        Generate text for a multi-frame input: per-frame vision prefill followed by the decode loop.
+
+        Args:
+            inputs: Mapping of model inputs, forwarded to ``run_prefill_multi_frame_specialization``.
+            num_frames: Number of frames contained in the vision input.
+            generation_len: Requested generation length; the value actually used is the one prefill returns.
+            stream: Currently unused; decoding always runs with ``streamer=None``.
+
+        Returns:
+            CloudAI100ExecInfo with the generated texts, generated ids and latency metrics.
+        """
+        exec_batch_size = self.batch_size
+        max_gen_length = self._ctx_len if not generation_len else max(self._ctx_len, generation_len)
+        self.initialize_decode_inputs(
+            num_prompts=1, execution_batch_size=exec_batch_size, max_gen_length=max_gen_length
+        )
+        if self.is_qwen_vl:
+            self.decode_pos_ids = np.zeros((4, exec_batch_size, 1), np.int64)
+
+        # Prefill: runs the vision encoder per frame, then chunked prefill on the language model
+        start = perf_counter()
+        outputs, position_ids, generation_len_final = self.run_prefill_multi_frame_specialization(
+            inputs, num_frames, generation_len
+        )
+        self.update_decode_input(outputs, position_ids, generation_len_final)
+
+        # Decode loop (timed separately from prefill)
+        decode_inputs = self.prepare_decode_inputs()
+        loop_start = perf_counter()
+        num_token = self.run_decode(decode_inputs, generation_len_final, automation=False, streamer=None)
+        end = perf_counter()
+
+        generated_texts = self.tokenizer.batch_decode(self.generated_ids, skip_special_tokens=True)
+        prefill_time, decode_perf, total_perf, total_time = calculate_latency(num_token, loop_start, start, end)
+        return CloudAI100ExecInfo(
+            batch_size=self.batch_size,
+            generated_texts=generated_texts,
+            generated_ids=self.generated_ids,
+            perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
+        )
+
     def _generate_regular_batching(self, vision_prompts, generation_len, stream, **kwargs):
         """Handle regular batching for vision-language generation without creating a second language session"""
         batch_results = []
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 222bb4c658..73536b24a1 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1697,6 +1697,8 @@ def generate(
         generation_len: Optional[int] = None,
         image_height: Optional[int] = None,
         image_width: Optional[int] = None,
+        multi_specs: Optional[bool] = None,
+        num_frames: Optional[int] = None,
         **kwargs,
     ) -> Union[torch.Tensor, np.ndarray]:
         """
@@ -1745,7 +1747,7 @@ def generate(
         self._write_io_dir = os.path.join(os.path.dirname(self.onnx_path[1]), "io_dir") if write_io else None
 
         # Use VisionLanguageGeneration for image-prompt pairs
-        if (processor and images) or (tokenizer and prompts):
+        if (processor and images) or (tokenizer and prompts) or multi_specs or num_frames:
             # Create VisionLanguageGeneration instance
             batch_size_comp, ctx_len_comp, fbs = get_compilation_dims(self.lang_model.qpc_path)
             vlm_gen = VisionLanguageGeneration(
@@ -1767,6 +1769,9 @@ def generate(
 
             # Call generate method
             return vlm_gen.generate(
+                inputs=inputs,
+                num_frames=num_frames,
+                multi_specs=multi_specs,
                 images=images,
                 prompts=prompts,
                 generation_len=generation_len,
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index dd70a31c95..78f068ab97 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -11,6 +11,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from qwen_vl_utils import smart_resize
 from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
@@ -904,9 +905,9 @@ def get_specializations(
         prefill_seq_len: int,
         ctx_len: int,
         img_size: None,
-        height: int = None,
-        width: int = None,
-        num_frames: int = 1,
+        height: int | List[int] = None,
+        width: int | List[int] = None,
+        num_frames: int | List[int] = 1,
         kv_offload: bool = False,
         continuous_batching: bool = False,
         kv_cache_batch_size: Optional[int] = None,
@@ -922,80 +923,69 @@ def get_specializations(
             logger.warning(
                 f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
             )
+        height = [height] if isinstance(height, int) else height
+        width = [width] if isinstance(width, int) else width
+        num_frames = [num_frames] * len(height) if isinstance(num_frames, int) else num_frames
+
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
         ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
         channel = 3
         patch_size = self.config.vision_config.patch_size
         temporal_patch_size = self.config.vision_config.temporal_patch_size
 
-        IMAGE_FACTOR = 28
-        MIN_PIXELS = 4 * 28 * 28
-        MAX_PIXELS = 16384 * 28 * 28
-        MAX_RATIO = 200
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = IMAGE_FACTOR,
-            min_pixels: int = MIN_PIXELS,
-            max_pixels: int = MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        IMAGE_FACTOR = constants.IMAGE_FACTOR_QWEN_2_5
+        IMAGE_MIN_TOKEN_NUM = constants.IMAGE_MIN_TOKEN_NUM
+        IMAGE_MAX_TOKEN_NUM = constants.IMAGE_MAX_TOKEN_NUM
+        min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2
+        max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2
+        mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None)
+        if mm_processor_kwargs:
+            min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels)
+            max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
+
+        vision = []
+        max_vision_size = 0
+        user_vision_size = compiler_options.pop("vision_size", None)
+        if user_vision_size:
+            assert user_vision_size < ctx_len, "vision_size must be less than ctx_len"
+            max_vision_size = user_vision_size
+
+        for h, w, f in zip(height, width, num_frames):
+            resized_height, resized_width = smart_resize(
+                height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
+            )
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+            grid_height = grid_h * grid_w
+            grid_width = patch_size * patch_size * temporal_patch_size * channel
+            vision_size = grid_height // 4
+            grid_height = grid_height * batch_size
+            if not user_vision_size:
+                max_vision_size = max(max_vision_size, vision_size * f)
+                assert max_vision_size < ctx_len, (
+                    f"Computed vision_size of {vision_size * f} tokens "
+                    f"(vision_size={vision_size}, num_frames={f}) for image resolution "
+                    f"(width={w}, height={h}) must be less than ctx_len. Please adjust the image "
+                    "resolution."
+                )
+            else:
+                assert vision_size * f < user_vision_size, (
+                    f"Computed vision_size of {vision_size * f} tokens "
+                    f"(vision_size={vision_size}, num_frames={f}) for image resolution "
+                    f"(width={w}, height={h}) cannot exceed the provided "
+                    f"vision_size={user_vision_size}. Please adjust the image resolution or "
+                    "increase the vision_size."
                 )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        resized_height, resized_width = smart_resize(height=height, width=width)
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
-        grid_height = grid_h * grid_w
-        grid_width = patch_size * patch_size * temporal_patch_size * channel
-        vision_size = grid_height // 4
-        vision_size = vision_size * num_frames
-        grid_height = grid_height * batch_size
-
-        vision = [
-            {
-                "batch_size": batch_size,
-                "vision_size": vision_size,
-                "grid_height": grid_height,
-                "grid_width": grid_width,
-                "grid_h": grid_h,
-                "grid_w": grid_w,
-            }
-        ]
 
+            vision.append(
+                {
+                    "batch_size": batch_size,
+                    "vision_size": vision_size,
+                    "grid_height": grid_height,
+                    "grid_width": grid_width,
+                    "grid_h": grid_h,
+                    "grid_w": grid_w,
+                }
+            )
         if comp_ctx_lengths_prefill is not None:
             lang = []
 
@@ -1004,7 +994,7 @@ def smart_resize(
                     "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1023,7 +1013,7 @@ def smart_resize(
                     "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1039,7 +1029,7 @@ def smart_resize(
                 "batch_size": 1 if continuous_batching else batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
             }
 
@@ -1054,7 +1044,7 @@ def smart_resize(
                 "batch_size": full_batch_size if continuous_batching else batch_size,
                 "seq_len": 1,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
             }
 
diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
index 18acc22c53..54b352f73e 100644
--- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
+++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -10,6 +10,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from qwen_vl_utils import smart_resize
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -908,11 +909,10 @@ def get_specializations(
         prefill_seq_len: int,
         ctx_len: int,
         img_size: None,
-        height: int = None,
-        width: int = None,
+        height: int | List[int] = None,
+        width: int | List[int] = None,
         time: int = 1,
-        # dimensions: List = None,
-        num_frames: int = 1,
+        num_frames: int | List[int] = 1,
         kv_offload: bool = False,
         continuous_batching: bool = False,
         kv_cache_batch_size: Optional[int] = None,
@@ -927,81 +927,71 @@ def get_specializations(
             logger.warning(
                 f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
             )
+        height = [height] if isinstance(height, int) else height
+        width = [width] if isinstance(width, int) else width
+        num_frames = [num_frames] * len(height) if isinstance(num_frames, int) else num_frames
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
         ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
         channel = 3
         patch_size = self.config.vision_config.patch_size
         temporal_patch_size = self.config.vision_config.temporal_patch_size
 
-        IMAGE_FACTOR = 32
-        MIN_PIXELS = 64 * 32 * 32
-        MAX_PIXELS = 16384 * 32 * 32
-        MAX_RATIO = 200
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = IMAGE_FACTOR,
-            min_pixels: int = MIN_PIXELS,
-            max_pixels: int = MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        IMAGE_FACTOR = constants.IMAGE_FACTOR_QWEN_3
+        IMAGE_MIN_TOKEN_NUM = constants.IMAGE_MIN_TOKEN_NUM
+        IMAGE_MAX_TOKEN_NUM = constants.IMAGE_MAX_TOKEN_NUM
+        min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2
+        max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2
+        mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None)
+        if mm_processor_kwargs:
+            min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels)
+            max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
+
+        vision = []
+        max_vision_size = 0
+        user_vision_size = compiler_options.pop("vision_size", None)
+        if user_vision_size:
+            assert user_vision_size < ctx_len, "vision_size must be less than ctx_len"
+            max_vision_size = user_vision_size
+
+        for h, w, f in zip(height, width, num_frames):
+            resized_height, resized_width = smart_resize(
+                height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
+            )
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+            grid_height = grid_h * grid_w
+            grid_width = patch_size * patch_size * temporal_patch_size * channel
+            vision_size = grid_height // 4
+            vision_size = vision_size * time
+            grid_height = grid_height * time * batch_size
+            if not user_vision_size:
+                max_vision_size = max(max_vision_size, vision_size * f)
+                assert max_vision_size < ctx_len, (
+                    f"Computed vision_size of {vision_size * f} tokens "
+                    f"(vision_size={vision_size}, num_frames={f}) for image resolution "
+                    f"(width={w}, height={h}) must be less than ctx_len. Please adjust the image "
+                    "resolution."
+                )
+            else:
+                assert vision_size * f < user_vision_size, (
+                    f"Computed vision_size of {vision_size * f} tokens "
+                    f"(vision_size={vision_size}, num_frames={f}) for image resolution "
+                    f"(width={w}, height={h}) cannot exceed the provided "
+                    f"vision_size={user_vision_size}. Please adjust the image resolution or "
+                    "increase the vision_size."
                 )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        resized_height, resized_width = smart_resize(height=height, width=width)
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
-        grid_height = grid_h * grid_w
-        grid_width = patch_size * patch_size * temporal_patch_size * channel
-        vision_size = grid_height // 4
-        vision_size = vision_size * num_frames * time
-        grid_height = grid_height * time * batch_size
-
-        vision = [
-            {
-                "batch_size": batch_size,
-                "vision_size": vision_size,
-                "grid_height": grid_height,
-                "grid_width": grid_width,
-                "time": time,
-                "grid_h": grid_h,
-                "grid_w": grid_w,
-                "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
-            }
-        ]
+
+            vision.append(
+                {
+                    "batch_size": batch_size,
+                    "vision_size": vision_size,
+                    "grid_height": grid_height,
+                    "grid_width": grid_width,
+                    "grid_h": grid_h,
+                    "grid_w": grid_w,
+                    "time": time,
+                    "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
+                }
+            )
 
         if comp_ctx_lengths_prefill is not None:
             lang = []
@@ -1011,7 +1001,7 @@ def smart_resize(
                     "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "vision_batch_size": batch_size,
                     "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
@@ -1031,7 +1021,7 @@ def smart_resize(
                     "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "vision_batch_size": batch_size,
                     "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
@@ -1048,7 +1038,7 @@ def smart_resize(
                 "batch_size": 1 if continuous_batching else batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
                 "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
             }
@@ -1064,7 +1054,7 @@ def smart_resize(
                 "batch_size": full_batch_size if continuous_batching else batch_size,
                 "seq_len": 1,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
                 "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
             }
diff --git a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
index bf02b32ab4..cb83c870e4 100644
--- a/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
+++ b/QEfficient/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
@@ -10,6 +10,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from qwen_vl_utils import smart_resize
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -946,11 +947,10 @@ def get_specializations(
         prefill_seq_len: int,
         ctx_len: int,
         img_size: None,
-        height: int = None,
-        width: int = None,
+        height: int | List[int] = None,
+        width: int | List[int] = None,
         time: int = 1,
-        # dimensions: List = None,
-        num_frames: int = 1,
+        num_frames: int | List[int] = 1,
         kv_offload: bool = False,
         continuous_batching: bool = False,
         kv_cache_batch_size: Optional[int] = None,
@@ -965,81 +965,73 @@ def get_specializations(
             logger.warning(
                 f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
             )
+
+        height = [height] if isinstance(height, int) else height
+        width = [width] if isinstance(width, int) else width
+        num_frames = [num_frames] * len(height) if isinstance(num_frames, int) else num_frames
+
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
         ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
         channel = 3
         patch_size = self.config.vision_config.patch_size
         temporal_patch_size = self.config.vision_config.temporal_patch_size
 
-        IMAGE_FACTOR = 32
-        MIN_PIXELS = 64 * 32 * 32
-        MAX_PIXELS = 16384 * 32 * 32
-        MAX_RATIO = 200
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = IMAGE_FACTOR,
-            min_pixels: int = MIN_PIXELS,
-            max_pixels: int = MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        IMAGE_FACTOR = constants.IMAGE_FACTOR_QWEN_3
+        IMAGE_MIN_TOKEN_NUM = constants.IMAGE_MIN_TOKEN_NUM
+        IMAGE_MAX_TOKEN_NUM = constants.IMAGE_MAX_TOKEN_NUM
+        min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2
+        max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2
+        mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None)
+        if mm_processor_kwargs:
+            min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels)
+            max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
+
+        vision = []
+        max_vision_size = 0
+        user_vision_size = compiler_options.pop("vision_size", None)
+        if user_vision_size:
+            assert user_vision_size < ctx_len, "vision_size must be less than ctx_len"
+            max_vision_size = user_vision_size
+
+        for h, w, f in zip(height, width, num_frames):
+            resized_height, resized_width = smart_resize(
+                height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
+            )
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+            grid_height = grid_h * grid_w
+            grid_width = patch_size * patch_size * temporal_patch_size * channel
+            vision_size = grid_height // 4
+            vision_size = vision_size * time
+            grid_height = grid_height * time * batch_size
+            if not user_vision_size:
+                max_vision_size = max(max_vision_size, vision_size * f)
+                assert max_vision_size < ctx_len, (
+                    f"Computed vision_size of {vision_size * f} tokens "
+                    f"(vision_size={vision_size}, num_frames={f}) for image resolution "
+                    f"(width={w}, height={h}) must be less than ctx_len. Please adjust the image "
+                    "resolution."
                 )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        resized_height, resized_width = smart_resize(height=height, width=width)
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
-        grid_height = grid_h * grid_w
-        grid_width = patch_size * patch_size * temporal_patch_size * channel
-        vision_size = grid_height // 4
-        vision_size = vision_size * num_frames * time
-        grid_height = grid_height * time * batch_size
-
-        vision = [
-            {
-                "batch_size": batch_size,
-                "vision_size": vision_size,
-                "grid_height": grid_height,
-                "grid_width": grid_width,
-                "time": time,
-                "grid_h": grid_h,
-                "grid_w": grid_w,
-                "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
-            }
-        ]
+            else:
+                # Equality is allowed: a user budget that exactly fits this input is valid ("cannot exceed").
+                assert vision_size * f <= user_vision_size, (
+                    f"Computed vision_size of {vision_size * f} tokens "
+                    f"(vision_size={vision_size}, num_frames={f}) for image resolution "
+                    f"(width={w}, height={h}) cannot exceed the provided "
+                    f"vision_size={user_vision_size}. Please adjust the image resolution or increase the vision_size."
+                )
+
+            vision.append(
+                {
+                    "batch_size": batch_size,
+                    "vision_size": vision_size,
+                    "grid_height": grid_height,
+                    "grid_width": grid_width,
+                    "grid_h": grid_h,
+                    "grid_w": grid_w,
+                    "time": time,
+                    "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
+                }
+            )
 
         if comp_ctx_lengths_prefill is not None:
             lang = []
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index f71045834d..36dd35081f 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -184,10 +184,14 @@ def get_default_aic_hw_version() -> str:
 # Qwen2_5_vl Constants
 QWEN2_5_VL_HEIGHT = 354
 QWEN2_5_VL_WIDTH = 536
+IMAGE_FACTOR_QWEN_2_5 = 28
+IMAGE_MIN_TOKEN_NUM = 4
+IMAGE_MAX_TOKEN_NUM = 16384
 
 # Qwen3_vl Constanst
 QWEN3_VL_HEIGHT = 354
 QWEN3_VL_WIDTH = 536
+IMAGE_FACTOR_QWEN_3 = 32
 
 # Modules to cache while clearing the pytorch weights
 CACHE_MODULES = ["get_output_names", "get_dummy_inputs", "get_onnx_dynamic_axes", "get_specializations"]
diff --git a/examples/image_text_to_text/models/qwen_vl/basic_inference.py b/examples/image_text_to_text/models/qwen2_5_vl/basic_inference.py
similarity index 100%
rename from examples/image_text_to_text/models/qwen_vl/basic_inference.py
rename to examples/image_text_to_text/models/qwen2_5_vl/basic_inference.py
diff --git a/examples/image_text_to_text/models/qwen_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml b/examples/image_text_to_text/models/qwen2_5_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml
similarity index 100%
rename from examples/image_text_to_text/models/qwen_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml
rename to examples/image_text_to_text/models/qwen2_5_vl/configs/Qwen2.5-VL-32B-Instruct-AWQ.yaml
diff --git a/examples/image_text_to_text/models/qwen_vl/continuous_batching.py b/examples/image_text_to_text/models/qwen2_5_vl/continuous_batching.py
similarity index 100%
rename from examples/image_text_to_text/models/qwen_vl/continuous_batching.py
rename to examples/image_text_to_text/models/qwen2_5_vl/continuous_batching.py
diff --git a/examples/image_text_to_text/models/qwen2_5_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen2_5_vl/multi_specialization_inference.py
new file mode 100644
index 0000000000..00d454ff01
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen2_5_vl/multi_specialization_inference.py
@@ -0,0 +1,147 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# For AWQ models, update PyTorch to version 2.8.*
+model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+# config.text_config.num_hidden_layers = 2
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Set skip_vision=True to run text-only inference.
+skip_vision = False
+
+if skip_vision:  # Only Text
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=8,
+        height=354,
+        width=536,
+        mxfp6_matmul=False,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:  # Vision + Text
+    batch_size = 1
+    ctx_len = 8192
+
+    resolutions = [
+        {"width": 360, "height": 120, "num_frames": 3},
+        {"width": 320, "height": 180, "num_frames": 2},
+        {"width": 360, "height": 240, "num_frames": 1},
+        {"width": 454, "height": 256, "num_frames": 1},
+    ]
+
+    widths = [s["width"] for s in resolutions]
+    heights = [s["height"] for s in resolutions]
+    num_frames = [s["num_frames"] for s in resolutions]
+    # vision_size = 4096
+    #
+    # vision_size is the maximum visual-token budget: it limits the ViT (Vision Transformer)
+    # embeddings passed to the language decoder, together with the text prompt. Increasing
+    # this value preserves more visual detail, but consumes more of the model's context length.
+    # This argument is optional; if not provided, vision_size is automatically derived from
+    # the input image resolutions to support the largest resolution.
+
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=ctx_len,
+        num_cores=16,
+        num_devices=2,
+        height=heights,
+        width=widths,
+        num_frames=num_frames,
+        mm_processor_kwargs={
+            "min_pixels": 4 * 28 * 28,
+            "max_pixels": 16384 * 28 * 28,
+        },
+        # vision_size=vision_size,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        mos=1,
+    )
+
+    image_url = "https://picsum.photos/id/237/536/354"
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    image = image.resize((360, 120))  # Resize to any dimension (width, height) present in specializations
+    frames = 3
+
+    content = [{"type": "image", "image": image} for _ in range(frames)] + [
+        {"type": "text", "text": "Describe the image"}
+    ]
+
+    messages = [{"role": "user", "content": content}]
+
+    messages = [messages] * batch_size
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    image_grid_thw = inputs.get("image_grid_thw")
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(
+        inputs=inputs, tokenizer=tokenizer, generation_len=100, multi_specs=True, num_frames=frames
+    )
+    print(output.generated_ids)
+    print(output.generated_texts)
+    print(output)
diff --git a/examples/image_text_to_text/models/qwen3_vl_moe/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen3_vl_moe/multi_specialization_inference.py
new file mode 100644
index 0000000000..c9c1d81383
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen3_vl_moe/multi_specialization_inference.py
@@ -0,0 +1,150 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+model_id = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+
+# For faster execution, the model can be run with fewer layers (for testing purposes only).
+# config.vision_config.depth = 9
+# config.text_config.num_hidden_layers = 1
+# config.vision_config.deepstack_visual_indexes = [8]
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Set skip_vision=True to run text-only inference.
+skip_vision = False
+
+if skip_vision:  # Only Text
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=4,
+        height=354,
+        width=536,
+        mxfp6_matmul=True,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+        use_onnx_subfunctions=True,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:  # Vision + Text
+    batch_size = 1
+    ctx_len = 8192
+
+    resolutions = [
+        {"width": 360, "height": 240, "num_frames": 3},
+        {"width": 536, "height": 354, "num_frames": 2},
+        {"width": 1024, "height": 1024, "num_frames": 1},
+    ]
+
+    widths = [s["width"] for s in resolutions]
+    heights = [s["height"] for s in resolutions]
+    num_frames = [s["num_frames"] for s in resolutions]
+    # vision_size = 4096
+    #
+    # vision_size is the maximum visual-token budget: it limits the ViT (Vision Transformer)
+    # embeddings passed to the language decoder, together with the text prompt. Increasing
+    # this value preserves more visual detail, but consumes more of the model's context length.
+    # This argument is optional; if not provided, vision_size is automatically derived from
+    # the input image resolutions to support the largest resolution.
+
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=ctx_len,
+        num_cores=16,
+        num_devices=4,
+        height=heights,
+        width=widths,
+        num_frames=num_frames,
+        mm_processor_kwargs={
+            "min_pixels": 4 * 32 * 32,
+            "max_pixels": 16384 * 32 * 32,
+        },
+        # vision_size=vision_size,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        mos=1,
+        use_onnx_subfunctions=False,
+    )
+
+    image_url = "https://picsum.photos/id/237/536/354"
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    image = image.resize((360, 240))  # Resize to any dimension (width, height) present in specializations
+    frames = 3
+
+    content = [{"type": "image", "image": image} for _ in range(frames)] + [
+        {"type": "text", "text": "Describe the visual"}
+    ]
+
+    messages = [{"role": "user", "content": content}]
+
+    messages = [messages] * batch_size
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(
+        inputs=inputs, tokenizer=tokenizer, generation_len=100, multi_specs=True, num_frames=frames
+    )
+    print(output.generated_ids)
+    print(output.generated_texts)
+    print(output)
diff --git a/examples/image_text_to_text/models/qwen3vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen3vl/multi_specialization_inference.py
new file mode 100644
index 0000000000..7456dce289
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen3vl/multi_specialization_inference.py
@@ -0,0 +1,150 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# For AWQ models, update PyTorch to version 2.8.*
+model_id = "Qwen/Qwen3-VL-2B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+
+# config.vision_config.depth = 9
+# config.text_config.num_hidden_layers = 1
+# config.vision_config.deepstack_visual_indexes = [8]
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Set skip_vision=True to run text-only inference.
+skip_vision = False
+
+if skip_vision:  # Only Text
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=4,
+        height=354,
+        width=536,
+        mxfp6_matmul=True,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+        use_onnx_subfunctions=True,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:  # Vision + Text
+    batch_size = 1
+    ctx_len = 8192
+
+    resolutions = [
+        {"width": 360, "height": 240, "num_frames": 3},
+        {"width": 536, "height": 354, "num_frames": 2},
+        {"width": 1024, "height": 1024, "num_frames": 1},
+    ]
+
+    widths = [s["width"] for s in resolutions]
+    heights = [s["height"] for s in resolutions]
+    num_frames = [s["num_frames"] for s in resolutions]
+    # vision_size = 4096
+    #
+    # vision_size is the maximum visual-token budget: it limits the ViT (Vision Transformer)
+    # embeddings passed to the language decoder, together with the text prompt. Increasing
+    # this value preserves more visual detail, but consumes more of the model's context length.
+    # This argument is optional; if not provided, vision_size is automatically derived from
+    # the input image resolutions to support the largest resolution.
+
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=ctx_len,
+        num_cores=16,
+        num_devices=4,
+        height=heights,
+        width=widths,
+        num_frames=num_frames,
+        mm_processor_kwargs={
+            "min_pixels": 4 * 32 * 32,
+            "max_pixels": 16384 * 32 * 32,
+        },
+        # vision_size=vision_size,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        mos=1,
+        use_onnx_subfunctions=False,
+    )
+
+    image_url = "https://picsum.photos/id/237/536/354"
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    image = image.resize((360, 240))  # Resize to any dimension (width, height) present in specializations
+    frames = 3
+
+    content = [{"type": "image", "image": image} for _ in range(frames)] + [
+        {"type": "text", "text": "Describe the visual"}
+    ]
+
+    messages = [{"role": "user", "content": content}]
+
+    messages = [messages] * batch_size
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(
+        inputs=inputs, tokenizer=tokenizer, generation_len=100, multi_specs=True, num_frames=frames
+    )
+    print(output.generated_ids)
+    print(output.generated_texts)
+    print(output)
diff --git a/pyproject.toml b/pyproject.toml
index b3c44e22d9..7cfbaf83f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
     "tensorboard",
     "fire",
     "py7zr",
+    "qwen-vl-utils==0.0.8",
     "torchmetrics==1.7.0",
     "ftfy==6.3.1",
     "imageio==2.37.2",