quic · quic-amitraj · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
@@ -32,6 +32,7 @@
 from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEffFluxPipeline
 from QEfficient.diffusers.pipelines.wan.pipeline_wan import QEffWanPipeline
 from QEfficient.diffusers.pipelines.wan.pipeline_wan_i2v import QEffWanImageToVideoPipeline
+from QEfficient.diffusers.pipelines.qwen_image.pipeline_qwenimage import QEffQwenImagePipeline
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
 from QEfficient.peft import QEffAutoPeftModelForCausalLM
@@ -61,6 +62,7 @@
     "QEffFluxPipeline",
     "QEffWanPipeline",
     "QEffWanImageToVideoPipeline",
+    "QEffQwenImagePipeline",
 ]
 
 

diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
@@ -0,0 +1,252 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""
+Qeff modeling changes
+ - Changed upsampling mode from "nearest-exact" to "nearest" for ONNX compatibility.
+ - Used max(0, x.shape[2] - CACHE_T) instead of CACHE_T because x.shape[2] is either 1 or 4,
+ - CACHE_T = 2. This ensures the value never goes negative
+"""
+
+import torch
+from diffusers.models.autoencoders.autoencoder_kl_qwenimage import (
+    QwenImageDecoder3d,
+    QwenImageEncoder3d,
+    QwenImageResample,
+    QwenImageResidualBlock,
+    QwenImageUpsample,
+)
+
+CACHE_T = 2
+
+
+class QEffQwenImageResample(QwenImageResample):
+    r"""
+    A custom resampling module for 2D and 3D data.
+
+    Args:
+        dim (int): The number of input/output channels.
+        mode (str): The resampling mode. Must be one of:
+            - 'none': No resampling (identity operation).
+            - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
+            - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
+            - 'downsample2d': 2D downsampling with zero-padding and convolution.
+            - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
+    """
+
+    def __qeff_init__(self):
+        # Changed upsampling mode from "nearest-exact" to "nearest" for ONNX compatibility.
+        # Since the scale factor is an integer, both modes behave the
+        if self.mode in ("upsample2d", "upsample3d"):
+            self.resample[0] = QwenImageUpsample(scale_factor=(2.0, 2.0), mode="nearest")
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        b, c, t, h, w = x.size()
+        if self.mode == "upsample3d":
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = "Rep"
+                    feat_idx[0] += 1
+                else:
+                    cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
+                        # cache last frame of last two chunk
+                        cache_x = torch.cat(
+                            [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
+                        )
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
+                        cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
+                    if feat_cache[idx] == "Rep":
+                        x = self.time_conv(x)
+                    else:
+                        x = self.time_conv(x, feat_cache[idx])
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+
+                    x = x.reshape(b, 2, c, t, h, w)
+                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
+                    x = x.reshape(b, c, t * 2, h, w)
+        t = x.shape[2]
+        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+        x = self.resample(x)
+        x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
+
+        if self.mode == "downsample3d":
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = x.clone()
+                    feat_idx[0] += 1
+                else:
+                    cache_x = x[:, :, -1:, :, :].clone()
+                    x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+        return x
+
+
+class QEffQwenImageResidualBlock(QwenImageResidualBlock):
+    r"""
+    A custom residual block module.
+
+    Args:
+        in_dim (int): Number of input channels.
+        out_dim (int): Number of output channels.
+        dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
+        non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
+    """
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        # Apply shortcut connection
+        h = self.conv_shortcut(x)
+
+        # First normalization and activation
+        x = self.norm1(x)
+        x = self.nonlinearity(x)
+
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+
+            x = self.conv1(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        # Second normalization and activation
+        x = self.norm2(x)
+        x = self.nonlinearity(x)
+
+        # Dropout
+        x = self.dropout(x)
+
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+
+            x = self.conv2(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv2(x)
+
+        # Add residual connection
+        return x + h
+
+
+class QEffQwenImageEncoder3d(QwenImageEncoder3d):
+    r"""
+    A 3D encoder module.
+
+    Args:
+        dim (int): The base number of channels in the first layer.
+        z_dim (int): The dimensionality of the latent space.
+        dim_mult (list of int): Multipliers for the number of channels in each block.
+        num_res_blocks (int): Number of residual blocks in each block.
+        attn_scales (list of float): Scales at which to apply attention mechanisms.
+        temperal_downsample (list of bool): Whether to downsample temporally in each block.
+        dropout (float): Dropout rate for the dropout layers.
+        non_linearity (str): Type of non-linearity to use.
+    """
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_in(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_in(x)
+
+        ## downsamples
+        for layer in self.down_blocks:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## middle
+        x = self.mid_block(x, feat_cache, feat_idx)
+
+        ## head
+        x = self.norm_out(x)
+        x = self.nonlinearity(x)
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_out(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_out(x)
+        return x
+
+
+class QEffQwenImageDecoder3d(QwenImageDecoder3d):
+    r"""
+    A 3D decoder module.
+
+    Args:
+        dim (int): The base number of channels in the first layer.
+        z_dim (int): The dimensionality of the latent space.
+        dim_mult (list of int): Multipliers for the number of channels in each block.
+        num_res_blocks (int): Number of residual blocks in each block.
+        attn_scales (list of float): Scales at which to apply attention mechanisms.
+        temperal_upsample (list of bool): Whether to upsample temporally in each block.
+        dropout (float): Dropout rate for the dropout layers.
+        non_linearity (str): Type of non-linearity to use.
+    """
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        ## conv1
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_in(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_in(x)
+
+        ## middle
+        x = self.mid_block(x, feat_cache, feat_idx)
+
+        ## upsamples
+        for up_block in self.up_blocks:
+            x = up_block(x, feat_cache, feat_idx)
+
+        ## head
+        x = self.norm_out(x)
+        x = self.nonlinearity(x)
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_out(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_out(x)
+        return x
diff --git a/QEfficient/diffusers/models/pytorch_transforms.py b/QEfficient/diffusers/models/pytorch_transforms.py
@@ -5,6 +5,13 @@
 #
 # -----------------------------------------------------------------------------
 
+from diffusers.models.attention_processor import Attention
+from diffusers.models.autoencoders.autoencoder_kl_qwenimage import (
+    QwenImageDecoder3d,
+    QwenImageEncoder3d,
+    QwenImageResample,
+    QwenImageResidualBlock,
+)
 from diffusers.models.autoencoders.autoencoder_kl_wan import (
     AutoencoderKLWan,
     WanDecoder3d,
@@ -20,11 +27,21 @@
     FluxTransformer2DModel,
     FluxTransformerBlock,
 )
+from diffusers.models.transformers.transformer_qwenimage import (
+    QwenImageTransformer2DModel,
+    QwenImageTransformerBlock,
+)
 from diffusers.models.transformers.transformer_wan import WanAttention, WanAttnProcessor, WanTransformer3DModel
 from torch import nn
 
 from QEfficient.base.pytorch_transforms import ModuleMappingTransform
 from QEfficient.customop.rms_norm import CustomRMSNormAIC
+from QEfficient.diffusers.models.autoencoders.autoencoder_kl_qwenimage import (
+    QEffQwenImageDecoder3d,
+    QEffQwenImageEncoder3d,
+    QEffQwenImageResample,
+    QEffQwenImageResidualBlock,
+)
 from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import (
     QEffAutoencoderKLWan,
     QEffWanDecoder3d,
@@ -44,6 +61,11 @@
     QEffFluxTransformer2DModel,
     QEffFluxTransformerBlock,
 )
+from QEfficient.diffusers.models.transformers.transformer_qwenimage import (
+    QEffQwenImageAttention,
+    QEffQwenImageTransformer2DModel,
+    QEffQwenImageTransformerBlock,
+)
 from QEfficient.diffusers.models.transformers.transformer_wan import (
     QEffWanAttention,
     QEffWanAttnProcessor,
@@ -69,10 +91,17 @@ class AttentionTransform(ModuleMappingTransform):
         WanAttention: QEffWanAttention,
         WanTransformer3DModel: QEffWanTransformer3DModel,
         AutoencoderKLWan: QEffAutoencoderKLWan,
+        QwenImageTransformer2DModel: QEffQwenImageTransformer2DModel,
+        QwenImageTransformerBlock: QEffQwenImageTransformerBlock,
+        Attention: QEffQwenImageAttention,
         WanDecoder3d: QEffWanDecoder3d,
         WanEncoder3d: QEffWanEncoder3d,
         WanResidualBlock: QEffWanResidualBlock,
         WanResample: QEffWanResample,
+        QwenImageResample: QEffQwenImageResample,
+        QwenImageResidualBlock: QEffQwenImageResidualBlock,
+        QwenImageEncoder3d: QEffQwenImageEncoder3d,
+        QwenImageDecoder3d: QEffQwenImageDecoder3d,
     }