Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3f9968d
Onboarding Qwen Image
qcdipankar Dec 11, 2025
c306c3d
Cleaning Done 1
qcdipankar Dec 11, 2025
03d6973
Comments added
qcdipankar Dec 11, 2025
393751f
Fixing QwenProcessor issue in diffuser pipeline
qcdipankar Dec 11, 2025
b97788a
Minor Fixes
qcdipankar Dec 11, 2025
7ead3f7
Repointer Issue fixed
qcdipankar Dec 11, 2025
e502f7b
Ruff check fixed
qcdipankar Dec 11, 2025
a2a072e
DCO fix
qcdipankar Dec 11, 2025
6f2f7eb
Cleaning Done
qcdipankar Dec 22, 2025
4b747e8
[WIP] Updating Qwen image with scaling changes
tv-karthikeya Feb 11, 2026
32559ef
Enabling Vae on Qaic, DIT rope out
tv-karthikeya Feb 16, 2026
37e25ed
Wan I2V support (#788)
tv-karthikeya Mar 25, 2026
e0eca20
Onboarding Qwen Image
qcdipankar Dec 11, 2025
f961134
Fixing QwenProcessor issue in diffuser pipeline
qcdipankar Dec 11, 2025
119a352
[WIP] Updating Qwen image with scaling changes
tv-karthikeya Feb 11, 2026
b68de4e
Enabling Vae on Qaic, DIT rope out
tv-karthikeya Feb 16, 2026
b2c7315
Updated Qwen Image, resolved issue wrt static shapes
tv-karthikeya Apr 21, 2026
4c04368
Merge branch 'quic:main' into qwen_image_pipeline
tv-karthikeya Apr 30, 2026
164ce79
Refactored Qwen images scripts
tv-karthikeya Apr 30, 2026
751455e
Added Readme for Qwen Image
tv-karthikeya May 4, 2026
bb2af6b
Merge branch 'main' into qwen_image_pipeline
tv-karthikeya May 4, 2026
ae409f0
Merge branch 'main' into qwen_image_pipeline
tv-karthikeya May 6, 2026
f6777c9
added suport of magcache for qwen_image
quic-amitraj May 19, 2026
25b15e0
Updated example script and pipeline
quic-amitraj May 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEffFluxPipeline
from QEfficient.diffusers.pipelines.wan.pipeline_wan import QEffWanPipeline
from QEfficient.diffusers.pipelines.wan.pipeline_wan_i2v import QEffWanImageToVideoPipeline
from QEfficient.diffusers.pipelines.qwen_image.pipeline_qwenimage import QEffQwenImagePipeline
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
Expand Down Expand Up @@ -61,6 +62,7 @@
"QEffFluxPipeline",
"QEffWanPipeline",
"QEffWanImageToVideoPipeline",
"QEffQwenImagePipeline",
]


Expand Down
252 changes: 252 additions & 0 deletions QEfficient/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

"""
QEff modeling changes
- Changed upsampling mode from "nearest-exact" to "nearest" for ONNX compatibility.
- Used max(0, x.shape[2] - CACHE_T) as the slice start instead of -CACHE_T. x.shape[2] is
  either 1 or 4 and CACHE_T = 2, so this ensures the start index never goes negative.
"""

import torch
from diffusers.models.autoencoders.autoencoder_kl_qwenimage import (
QwenImageDecoder3d,
QwenImageEncoder3d,
QwenImageResample,
QwenImageResidualBlock,
QwenImageUpsample,
)

CACHE_T = 2


class QEffQwenImageResample(QwenImageResample):
    r"""
    A custom resampling module for 2D and 3D data.

    QEff variant of ``QwenImageResample``. Compared with the parent it
    - swaps the upsampling layer for a "nearest" one in ``__qeff_init__`` (ONNX compatibility), and
    - slices the temporal cache with a start index clamped at 0.

    Args:
        dim (int): The number of input/output channels.
        mode (str): The resampling mode. Must be one of:
            - 'none': No resampling (identity operation).
            - 'upsample2d': 2D upsampling with nearest interpolation and convolution.
            - 'upsample3d': 3D upsampling with nearest interpolation, convolution, and causal 3D convolution.
            - 'downsample2d': 2D downsampling with zero-padding and convolution.
            - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
    """

    def __qeff_init__(self):
        """Replace the first ``resample`` layer with a "nearest" upsampler for the upsample modes."""
        # Changed upsampling mode from "nearest-exact" to "nearest" for ONNX compatibility.
        # Since the scale factor is an integer, both modes behave the same.
        if self.mode in ("upsample2d", "upsample3d"):
            self.resample[0] = QwenImageUpsample(scale_factor=(2.0, 2.0), mode="nearest")

    def forward(self, x, feat_cache=None, feat_idx=None):
        """
        Resample ``x`` and, when a cache is supplied, update the temporal feature cache.

        Args:
            x (torch.Tensor): 5-D input unpacked as ``(b, c, t, h, w)``.
            feat_cache (list, optional): Per-layer cache, indexed by ``feat_idx[0]`` and updated in place.
                Entries are ``None``, the string sentinel ``"Rep"``, or a tensor.
            feat_idx (list, optional): One-element list used as a mutable cache cursor; it is
                incremented in place. A fresh ``[0]`` is created when omitted.

        Returns:
            torch.Tensor: The resampled tensor.
        """
        # A fresh list per call avoids sharing (and mutating) one default list across calls.
        if feat_idx is None:
            feat_idx = [0]

        b, c, t, h, w = x.size()
        if self.mode == "upsample3d":
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # First visit of this slot: only mark it, no temporal conv yet.
                    feat_cache[idx] = "Rep"
                    feat_idx[0] += 1
                else:
                    prev = feat_cache[idx]
                    # Explicit sentinel test instead of comparing a tensor with a string.
                    prev_is_rep = isinstance(prev, str) and prev == "Rep"

                    # Clamp the start index at 0 so it never goes negative.
                    cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
                    if cache_x.shape[2] < 2:
                        if prev_is_rep:
                            # No previous frames are cached yet: pad with zeros.
                            pad = torch.zeros_like(cache_x)
                        else:
                            # cache last frame of last two chunk
                            pad = prev[:, :, -1, :, :].unsqueeze(2).to(cache_x.device)
                        cache_x = torch.cat([pad, cache_x], dim=2)

                    if prev_is_rep:
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, prev)
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1

                    # NOTE(review): nesting reconstructed from semantics. This reshape needs
                    # twice as many elements as the input, so it can only follow time_conv.
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
                    x = x.reshape(b, c, t * 2, h, w)

        # Fold the frame axis into the batch axis so the 2D resample runs per frame.
        t = x.shape[2]
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        x = self.resample(x)
        x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)

        if self.mode == "downsample3d":
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -1:, :, :].clone()
                    # Prepend the last cached frame before the temporal convolution.
                    x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
        return x


class QEffQwenImageResidualBlock(QwenImageResidualBlock):
    r"""
    A custom residual block module.

    QEff variant of ``QwenImageResidualBlock``: the temporal cache slice uses a start index
    clamped at 0 (``max(0, x.shape[2] - CACHE_T)``).

    Args:
        in_dim (int): Number of input channels.
        out_dim (int): Number of output channels.
        dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
        non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
    """

    def forward(self, x, feat_cache=None, feat_idx=None):
        """
        Run norm -> activation -> conv twice and add the shortcut branch.

        Args:
            x (torch.Tensor): 5-D input; axis 2 is sliced as the frame axis for caching.
            feat_cache (list, optional): Per-layer cache, indexed by ``feat_idx[0]`` and updated in place.
                Two consecutive slots are used, one per convolution.
            feat_idx (list, optional): One-element list used as a mutable cache cursor; it is
                incremented in place. A fresh ``[0]`` is created when omitted.

        Returns:
            torch.Tensor: ``conv2(...) + conv_shortcut(x)``.
        """
        # A fresh list per call avoids sharing (and mutating) one default list across calls.
        if feat_idx is None:
            feat_idx = [0]

        # Apply shortcut connection
        h = self.conv_shortcut(x)

        # First normalization and activation
        x = self.norm1(x)
        x = self.nonlinearity(x)

        if feat_cache is not None:
            idx = feat_idx[0]
            # Keep at most the last CACHE_T frames; the start index is clamped at 0.
            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # Fewer than two frames available: prepend the last cached frame.
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)

            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        # Second normalization and activation
        x = self.norm2(x)
        x = self.nonlinearity(x)

        # Dropout
        x = self.dropout(x)

        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)

            x = self.conv2(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv2(x)

        # Add residual connection
        return x + h


class QEffQwenImageEncoder3d(QwenImageEncoder3d):
    r"""
    A 3D encoder module.

    QEff variant of ``QwenImageEncoder3d``: the temporal cache slice uses a start index
    clamped at 0 (``max(0, x.shape[2] - CACHE_T)``).

    Args:
        dim (int): The base number of channels in the first layer.
        z_dim (int): The dimensionality of the latent space.
        dim_mult (list of int): Multipliers for the number of channels in each block.
        num_res_blocks (int): Number of residual blocks in each block.
        attn_scales (list of float): Scales at which to apply attention mechanisms.
        temperal_downsample (list of bool): Whether to downsample temporally in each block.
        dropout (float): Dropout rate for the dropout layers.
        non_linearity (str): Type of non-linearity to use.
    """

    def forward(self, x, feat_cache=None, feat_idx=None):
        """
        Encode ``x`` through conv_in, the down blocks, the middle block and the output head.

        Args:
            x (torch.Tensor): 5-D input; axis 2 is sliced as the frame axis for caching.
            feat_cache (list, optional): Per-layer cache, indexed by ``feat_idx[0]`` and updated in place;
                it is also forwarded to the sub-blocks.
            feat_idx (list, optional): One-element list used as a mutable cache cursor; it is
                incremented in place. A fresh ``[0]`` is created when omitted.

        Returns:
            torch.Tensor: Output of ``conv_out``.
        """
        # A fresh list per call avoids sharing (and mutating) one default list across calls.
        if feat_idx is None:
            feat_idx = [0]

        if feat_cache is not None:
            idx = feat_idx[0]
            # Keep at most the last CACHE_T frames; the start index is clamped at 0.
            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv_in(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv_in(x)

        ## downsamples
        for layer in self.down_blocks:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## middle
        x = self.mid_block(x, feat_cache, feat_idx)

        ## head
        x = self.norm_out(x)
        x = self.nonlinearity(x)
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv_out(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv_out(x)
        return x


class QEffQwenImageDecoder3d(QwenImageDecoder3d):
    r"""
    A 3D decoder module.

    QEff variant of ``QwenImageDecoder3d``: the temporal cache slice uses a start index
    clamped at 0 (``max(0, x.shape[2] - CACHE_T)``).

    Args:
        dim (int): The base number of channels in the first layer.
        z_dim (int): The dimensionality of the latent space.
        dim_mult (list of int): Multipliers for the number of channels in each block.
        num_res_blocks (int): Number of residual blocks in each block.
        attn_scales (list of float): Scales at which to apply attention mechanisms.
        temperal_upsample (list of bool): Whether to upsample temporally in each block.
        dropout (float): Dropout rate for the dropout layers.
        non_linearity (str): Type of non-linearity to use.
    """

    def forward(self, x, feat_cache=None, feat_idx=None):
        """
        Decode ``x`` through conv_in, the middle block, the up blocks and the output head.

        Args:
            x (torch.Tensor): 5-D input; axis 2 is sliced as the frame axis for caching.
            feat_cache (list, optional): Per-layer cache, indexed by ``feat_idx[0]`` and updated in place;
                it is also forwarded to the sub-blocks.
            feat_idx (list, optional): One-element list used as a mutable cache cursor; it is
                incremented in place. A fresh ``[0]`` is created when omitted.

        Returns:
            torch.Tensor: Output of ``conv_out``.
        """
        # A fresh list per call avoids sharing (and mutating) one default list across calls.
        if feat_idx is None:
            feat_idx = [0]

        ## conv1
        if feat_cache is not None:
            idx = feat_idx[0]
            # Keep at most the last CACHE_T frames; the start index is clamped at 0.
            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv_in(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv_in(x)

        ## middle
        x = self.mid_block(x, feat_cache, feat_idx)

        ## upsamples
        for up_block in self.up_blocks:
            x = up_block(x, feat_cache, feat_idx)

        ## head
        x = self.norm_out(x)
        x = self.nonlinearity(x)
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv_out(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv_out(x)
        return x
29 changes: 29 additions & 0 deletions QEfficient/diffusers/models/pytorch_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
#
# -----------------------------------------------------------------------------

from diffusers.models.attention_processor import Attention
from diffusers.models.autoencoders.autoencoder_kl_qwenimage import (
QwenImageDecoder3d,
QwenImageEncoder3d,
QwenImageResample,
QwenImageResidualBlock,
)
from diffusers.models.autoencoders.autoencoder_kl_wan import (
AutoencoderKLWan,
WanDecoder3d,
Expand All @@ -20,11 +27,21 @@
FluxTransformer2DModel,
FluxTransformerBlock,
)
from diffusers.models.transformers.transformer_qwenimage import (
QwenImageTransformer2DModel,
QwenImageTransformerBlock,
)
from diffusers.models.transformers.transformer_wan import WanAttention, WanAttnProcessor, WanTransformer3DModel
from torch import nn

from QEfficient.base.pytorch_transforms import ModuleMappingTransform
from QEfficient.customop.rms_norm import CustomRMSNormAIC
from QEfficient.diffusers.models.autoencoders.autoencoder_kl_qwenimage import (
QEffQwenImageDecoder3d,
QEffQwenImageEncoder3d,
QEffQwenImageResample,
QEffQwenImageResidualBlock,
)
from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import (
QEffAutoencoderKLWan,
QEffWanDecoder3d,
Expand All @@ -44,6 +61,11 @@
QEffFluxTransformer2DModel,
QEffFluxTransformerBlock,
)
from QEfficient.diffusers.models.transformers.transformer_qwenimage import (
QEffQwenImageAttention,
QEffQwenImageTransformer2DModel,
QEffQwenImageTransformerBlock,
)
from QEfficient.diffusers.models.transformers.transformer_wan import (
QEffWanAttention,
QEffWanAttnProcessor,
Expand All @@ -69,10 +91,17 @@ class AttentionTransform(ModuleMappingTransform):
WanAttention: QEffWanAttention,
WanTransformer3DModel: QEffWanTransformer3DModel,
AutoencoderKLWan: QEffAutoencoderKLWan,
QwenImageTransformer2DModel: QEffQwenImageTransformer2DModel,
QwenImageTransformerBlock: QEffQwenImageTransformerBlock,
Attention: QEffQwenImageAttention,
WanDecoder3d: QEffWanDecoder3d,
WanEncoder3d: QEffWanEncoder3d,
WanResidualBlock: QEffWanResidualBlock,
WanResample: QEffWanResample,
QwenImageResample: QEffQwenImageResample,
QwenImageResidualBlock: QEffQwenImageResidualBlock,
QwenImageEncoder3d: QEffQwenImageEncoder3d,
QwenImageDecoder3d: QEffQwenImageDecoder3d,
}


Expand Down
Loading
Loading