From a03145845cea1fddec1cb9f8d7f4ceab9d11ba67 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 23 Dec 2025 07:03:04 +0000 Subject: [PATCH 01/10] Adding vae decoder in Wan Signed-off-by: Mohit Soni --- .../models/autoencoders/autoencoder_kl_wan.py | 221 ++++++++++++++++++ .../diffusers/models/pytorch_transforms.py | 19 ++ .../diffusers/pipelines/pipeline_module.py | 116 +++++++++ .../diffusers/pipelines/wan/pipeline_wan.py | 48 ++-- examples/diffusers/wan/wan_config.json | 30 ++- 5 files changed, 417 insertions(+), 17 deletions(-) create mode 100644 QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py new file mode 100644 index 000000000..abd9d9491 --- /dev/null +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -0,0 +1,221 @@ +# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from diffusers import AutoencoderKLWan +from diffusers.models.autoencoders.autoencoder_kl_wan import ( + WanDecoder3d, + WanEncoder3d, + WanResample, + WanResidualBlock, + WanUpsample, +) + +CACHE_T = 2 + +modes = [] + + +class QEffWanResample(WanResample): + def __qeff_init__(self): + if self.mode in ("upsample2d", "upsample3d"): + self.resample[0] = WanUpsample(scale_factor=(2.0, 2.0), mode="nearest") + + def forward(self, x, feat_cache=None, feat_idx=[0]): + b, c, t, h, w = x.size() + if self.mode == "upsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = "Rep" + feat_idx[0] += 1 + else: + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep": + # cache last frame of last two chunk + cache_x = torch.cat( + [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2 + ) + if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep": + cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2) + if feat_cache[idx] == "Rep": + x = self.time_conv(x) + else: + x = self.time_conv(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + + x = x.reshape(b, 2, c, t, h, w) + x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3) + x = x.reshape(b, c, t * 2, h, w) + t = x.shape[2] + x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w) + modes.append(self.mode) + x = self.resample(x) + x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4) + + if self.mode == "downsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = x.clone() + feat_idx[0] += 1 + else: + cache_x = x[:, :, -1:, :, :].clone() + x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + return x + + +class 
QEffWanResidualBlock(WanResidualBlock): + def forward(self, x, feat_cache=None, feat_idx=[0]): + # Apply shortcut connection + h = self.conv_shortcut(x) + + # First normalization and activation + x = self.norm1(x) + x = self.nonlinearity(x) + + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + # Second normalization and activation + x = self.norm2(x) + x = self.nonlinearity(x) + + # Dropout + x = self.dropout(x) + + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + + x = self.conv2(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv2(x) + + # Add residual connection + return x + h + + +class QEffWanEncoder3d(WanEncoder3d): + def forward(self, x, feat_cache=None, feat_idx=[0]): + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + x = self.conv_in(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_in(x) + + ## downsamples + for layer in self.down_blocks: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## middle + x = self.mid_block(x, feat_cache, feat_idx) + + ## head + x = self.norm_out(x) + x = self.nonlinearity(x) + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + x = self.conv_out(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_out(x) + return x + + +class QEffWanDecoder3d(WanDecoder3d): + def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): + ## conv1 + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + x = self.conv_in(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_in(x) + + ## middle + x = self.mid_block(x, feat_cache, feat_idx) + + ## upsamples + for up_block in self.up_blocks: + x = up_block(x, feat_cache, feat_idx, first_chunk=first_chunk) + + ## head + x = self.norm_out(x) + x = self.nonlinearity(x) + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], 
dim=2) + x = self.conv_out(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_out(x) + return x + + +class QEffAutoencoderKLWan(AutoencoderKLWan): + def forward( + self, + latent_sample: torch.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator=None, + ): + """ + Args: + sample (`torch.Tensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + + return self.decode(latent_sample) diff --git a/QEfficient/diffusers/models/pytorch_transforms.py b/QEfficient/diffusers/models/pytorch_transforms.py index 4fb5c3f12..6fa8cc28a 100644 --- a/QEfficient/diffusers/models/pytorch_transforms.py +++ b/QEfficient/diffusers/models/pytorch_transforms.py @@ -5,6 +5,13 @@ # # ----------------------------------------------------------------------------- +from diffusers.models.autoencoders.autoencoder_kl_wan import ( + AutoencoderKLWan, + WanDecoder3d, + WanEncoder3d, + WanResample, + WanResidualBlock, +) from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle, RMSNorm from diffusers.models.transformers.transformer_flux import ( FluxAttention, @@ -18,6 +25,13 @@ from QEfficient.base.pytorch_transforms import ModuleMappingTransform from QEfficient.customop.rms_norm import CustomRMSNormAIC +from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import ( + QEffAutoencoderKLWan, + QEffWanDecoder3d, + QEffWanEncoder3d, + QEffWanResample, + QEffWanResidualBlock, +) from QEfficient.diffusers.models.normalization import ( QEffAdaLayerNormContinuous, QEffAdaLayerNormZero, @@ -54,6 +68,11 @@ class AttentionTransform(ModuleMappingTransform): WanAttnProcessor: QEffWanAttnProcessor, WanAttention: QEffWanAttention, WanTransformer3DModel: QEffWanTransformer3DModel, + WanDecoder3d: QEffWanDecoder3d, + WanEncoder3d: QEffWanEncoder3d, + WanResidualBlock: QEffWanResidualBlock, + WanResample: QEffWanResample, + AutoencoderKLWan: QEffAutoencoderKLWan, } diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index 19e7701d4..268b47dae 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -214,6 +214,122 @@ def compile(self, specializations: List[Dict], **compiler_options) -> None: self._compile(specializations=specializations, **compiler_options) +class QEffAutoencoderKLWan(QEFFBaseModel): + """ + Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. + + This class handles VAE models with specific transformations and optimizations + for efficient inference on Qualcomm AI hardware. VAE models are used in diffusion + pipelines for encoding images to latent space and decoding latents back to images. + + Attributes: + model (nn.Module): The wrapped VAE model (deep copy of original) + type (str): VAE operation type ("encoder" or "decoder") + _pytorch_transforms (List): PyTorch transformations applied before ONNX export + _onnx_transforms (List): ONNX transformations applied after export + """ + + _pytorch_transforms = [AttentionTransform, CustomOpsTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + @property + def get_model_config(self) -> Dict: + """ + Get the model configuration as a dictionary. 
+ + Returns: + Dict: The configuration dictionary of the underlying VAE model + """ + return self.model.config.__dict__ + + def __init__(self, model: nn.Module, type: str) -> None: + """ + Initialize the VAE wrapper. + + Args: + model (nn.Module): The pipeline model containing the VAE + type (str): VAE operation type ("encoder" or "decoder") + """ + super().__init__(model) + self.model = model + + # To have different hashing for encoder/decoder + self.model.config["type"] = type + + def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: + """ + Generate ONNX export configuration for the VAE decoder. + + Args: + latent_height (int): Height of latent representation (default: 32) + latent_width (int): Width of latent representation (default: 32) + + Returns: + Tuple containing: + - example_inputs (Dict): Sample inputs for ONNX export + - dynamic_axes (Dict): Specification of dynamic dimensions + - output_names (List[str]): Names of model outputs + """ + bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + + # VAE decoder takes latent representation as input + example_inputs = { + "latent_sample": torch.randn(bs, 16, 21, 24, 40), + "return_dict": False, + } + + output_names = ["sample"] + + # All dimensions except channels can be dynamic + dynamic_axes = { + "latent_sample": {2: "num_frames", 3: "latent_height", 4: "latent_width"}, + } + + return example_inputs, dynamic_axes, output_names + + def export( + self, + inputs: Dict, + output_names: List[str], + dynamic_axes: Dict, + export_dir: str = None, + export_kwargs: Dict = {}, + ) -> str: + """ + Export the VAE model to ONNX format. + + Args: + inputs (Dict): Example inputs for ONNX export + output_names (List[str]): Names of model outputs + dynamic_axes (Dict): Specification of dynamic dimensions + export_dir (str, optional): Directory to save ONNX model + export_kwargs (Dict, optional): Additional export arguments + + Returns: + str: Path to the exported ONNX model + """ + + self.model.config["_use_default_values"].sort() + + return self._export( + example_inputs=inputs, + output_names=output_names, + dynamic_axes=dynamic_axes, + export_dir=export_dir, + **export_kwargs, + ) + + def compile(self, specializations: List[Dict], **compiler_options) -> None: + """ + Compile the ONNX model for Qualcomm AI hardware. + + Args: + specializations (List[Dict]): Model specialization configurations + **compiler_options: Additional compiler options + """ + self._compile(specializations=specializations, **compiler_options) + + class QEffVAE(QEFFBaseModel): """ Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 888763af0..3e15043d3 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -11,7 +11,7 @@ for high-performance text-to-video generation on Qualcomm AI hardware. The pipeline supports WAN 2.2 architectures with unified transformer. -TODO: 1. Update Vae, umt5 to Qaic; present running on cpu +TODO: 1. 
Update umt5 to Qaic; present running on cpu """ import os @@ -22,7 +22,7 @@ import torch from diffusers import WanPipeline -from QEfficient.diffusers.pipelines.pipeline_module import QEffWanUnifiedTransformer +from QEfficient.diffusers.pipelines.pipeline_module import QEffAutoencoderKLWan, QEffWanUnifiedTransformer from QEfficient.diffusers.pipelines.pipeline_utils import ( ONNX_SUBFUNCTION_MODULE, ModulePerf, @@ -106,11 +106,10 @@ def __init__(self, model, **kwargs): self.transformer = QEffWanUnifiedTransformer(self.unified_wrapper) # VAE decoder for latent-to-video conversion - self.vae_decode = model.vae - + self.vae_decoder = QEffAutoencoderKLWan(model.vae, "decoder") # Store all modules in a dictionary for easy iteration during export/compile - # TODO: add text encoder, vae decoder on QAIC - self.modules = {"transformer": self.transformer} + # TODO: add text encoder on QAIC + self.modules = {"transformer": self.transformer, "vae_decoder": self.vae_decoder} # Copy tokenizers and scheduler from the original model self.tokenizer = model.tokenizer @@ -336,7 +335,14 @@ def compile( "latent_width": latent_width, # Latent space width "num_frames": latent_frames, # Latent frames }, - ] + ], + "vae_decoder": [ + { + "num_frames": latent_frames, + "latent_height": latent_height, + "latent_width": latent_width, + } + ], } # Use generic utility functions for compilation @@ -548,6 +554,11 @@ def __call__( str(self.transformer.qpc_path), device_ids=self.transformer.device_ids ) + if self.vae_decoder.qpc_session is None: + self.vae_decoder.qpc_session = QAICInferenceSession( + str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids + ) + # Calculate compressed latent dimension for transformer buffer allocation cl, _, _, _ = calculate_latent_dimensions_with_frames( height, @@ -722,31 +733,36 @@ def __call__( # Step 9: Decode latents to video if not output_type == "latent": # Prepare latents for VAE decoding - latents = latents.to(self.vae_decode.dtype) + latents = latents.to(self.model.vae.dtype) # Apply VAE normalization (denormalization) latents_mean = ( - torch.tensor(self.vae_decode.config.latents_mean) - .view(1, self.vae_decode.config.z_dim, 1, 1, 1) + torch.tensor(self.model.vae.config.latents_mean) + .view(1, self.model.vae.config.z_dim, 1, 1, 1) .to(latents.device, latents.dtype) ) - latents_std = 1.0 / torch.tensor(self.vae_decode.config.latents_std).view( - 1, self.vae_decode.config.z_dim, 1, 1, 1 + latents_std = 1.0 / torch.tensor(self.model.vae.config.latents_std).view( + 1, self.model.vae.config.z_dim, 1, 1, 1 ).to(latents.device, latents.dtype) latents = latents / latents_std + latents_mean - # TODO: Enable VAE on QAIC - # VAE Decode latents to video using CPU (temporary) - video = self.model.vae.decode(latents, return_dict=False)[0] # CPU fallback + inputs_aic = {"latent_sample": latents.detach().numpy()} + + start_decode_time = time.perf_counter() + video = self.vae_decoder.qpc_session.run(inputs_aic) # CPU fallback + end_decode_time = time.perf_counter() + + vae_decoder_perf = end_decode_time - start_decode_time # Post-process video for output - video = self.model.video_processor.postprocess_video(video.detach()) + video = self.model.video_processor.postprocess_video(torch.tensor(video["sample"])) else: video = latents # Step 10: Collect performance metrics perf_data = { "transformer": transformer_perf, # Unified transformer (QAIC) + "vae_decoder": vae_decoder_perf, } # Build performance metrics for output diff --git a/examples/diffusers/wan/wan_config.json 
b/examples/diffusers/wan/wan_config.json index 7e752ba14..77fba9ab7 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -32,6 +32,34 @@ "execute": { "device_ids": null } - } + }, + "vae_decoder": + { + "specializations": [ + { + "batch_size": 1, + "num_channels": 16, + "num_frames": 21, + "latent_height": 60, + "latent_width": 104 + } + ], + "compilation": + { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 4, + "mxfp6_matmul": false, + "convert_to_fp16": true, + "aic_num_cores": 16, + "aic-enable-depth-first": true, + "compile_only":true + }, + "execute": + { + "device_ids": null + } + } + } } \ No newline at end of file From d54a374ccbe3cbd5db36184441134caf371d0ede Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 23 Dec 2025 17:33:00 +0000 Subject: [PATCH 02/10] Addressed comments Signed-off-by: Mohit Soni --- .../models/autoencoders/autoencoder_kl_wan.py | 17 --- .../diffusers/models/pytorch_transforms.py | 3 - .../diffusers/pipelines/pipeline_module.py | 103 ++---------------- .../diffusers/pipelines/wan/pipeline_wan.py | 43 +++++--- 4 files changed, 36 insertions(+), 130 deletions(-) diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py index abd9d9491..4c6a2160b 100644 --- a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -202,20 +202,3 @@ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): x = self.conv_out(x) return x - -class QEffAutoencoderKLWan(AutoencoderKLWan): - def forward( - self, - latent_sample: torch.Tensor, - sample_posterior: bool = False, - return_dict: bool = True, - generator=None, - ): - """ - Args: - sample (`torch.Tensor`): Input sample. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
- """ - - return self.decode(latent_sample) diff --git a/QEfficient/diffusers/models/pytorch_transforms.py b/QEfficient/diffusers/models/pytorch_transforms.py index 6fa8cc28a..fa637b2e9 100644 --- a/QEfficient/diffusers/models/pytorch_transforms.py +++ b/QEfficient/diffusers/models/pytorch_transforms.py @@ -6,7 +6,6 @@ # ----------------------------------------------------------------------------- from diffusers.models.autoencoders.autoencoder_kl_wan import ( - AutoencoderKLWan, WanDecoder3d, WanEncoder3d, WanResample, @@ -26,7 +25,6 @@ from QEfficient.base.pytorch_transforms import ModuleMappingTransform from QEfficient.customop.rms_norm import CustomRMSNormAIC from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import ( - QEffAutoencoderKLWan, QEffWanDecoder3d, QEffWanEncoder3d, QEffWanResample, @@ -72,7 +70,6 @@ class AttentionTransform(ModuleMappingTransform): WanEncoder3d: QEffWanEncoder3d, WanResidualBlock: QEffWanResidualBlock, WanResample: QEffWanResample, - AutoencoderKLWan: QEffAutoencoderKLWan, } diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index 268b47dae..d9f3f0ad6 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -214,7 +214,7 @@ def compile(self, specializations: List[Dict], **compiler_options) -> None: self._compile(specializations=specializations, **compiler_options) -class QEffAutoencoderKLWan(QEFFBaseModel): +class QEffVAE(QEFFBaseModel): """ Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. @@ -229,7 +229,7 @@ class QEffAutoencoderKLWan(QEFFBaseModel): _onnx_transforms (List): ONNX transformations applied after export """ - _pytorch_transforms = [AttentionTransform, CustomOpsTransform] + _pytorch_transforms = [CustomOpsTransform, AttentionTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] @property @@ -256,7 +256,7 @@ def __init__(self, model: nn.Module, type: str) -> None: # To have different hashing for encoder/decoder self.model.config["type"] = type - def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: + def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tuple[Dict, Dict, List[str]]: """ Generate ONNX export configuration for the VAE decoder. @@ -274,7 +274,7 @@ def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, 21, 24, 40), + "latent_sample": torch.randn(bs, 16, latent_height, latent_width), "return_dict": False, } @@ -282,97 +282,12 @@ def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # All dimensions except channels can be dynamic dynamic_axes = { - "latent_sample": {2: "num_frames", 3: "latent_height", 4: "latent_width"}, + "latent_sample": {0: "batch_size", 1: "channels", 2: "latent_height", 3: "latent_width"}, } return example_inputs, dynamic_axes, output_names - - def export( - self, - inputs: Dict, - output_names: List[str], - dynamic_axes: Dict, - export_dir: str = None, - export_kwargs: Dict = {}, - ) -> str: - """ - Export the VAE model to ONNX format. 
- - Args: - inputs (Dict): Example inputs for ONNX export - output_names (List[str]): Names of model outputs - dynamic_axes (Dict): Specification of dynamic dimensions - export_dir (str, optional): Directory to save ONNX model - export_kwargs (Dict, optional): Additional export arguments - - Returns: - str: Path to the exported ONNX model - """ - - self.model.config["_use_default_values"].sort() - - return self._export( - example_inputs=inputs, - output_names=output_names, - dynamic_axes=dynamic_axes, - export_dir=export_dir, - **export_kwargs, - ) - - def compile(self, specializations: List[Dict], **compiler_options) -> None: - """ - Compile the ONNX model for Qualcomm AI hardware. - - Args: - specializations (List[Dict]): Model specialization configurations - **compiler_options: Additional compiler options - """ - self._compile(specializations=specializations, **compiler_options) - - -class QEffVAE(QEFFBaseModel): - """ - Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. - - This class handles VAE models with specific transformations and optimizations - for efficient inference on Qualcomm AI hardware. VAE models are used in diffusion - pipelines for encoding images to latent space and decoding latents back to images. - - Attributes: - model (nn.Module): The wrapped VAE model (deep copy of original) - type (str): VAE operation type ("encoder" or "decoder") - _pytorch_transforms (List): PyTorch transformations applied before ONNX export - _onnx_transforms (List): ONNX transformations applied after export - """ - - _pytorch_transforms = [CustomOpsTransform] - _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - - @property - def get_model_config(self) -> Dict: - """ - Get the model configuration as a dictionary. - - Returns: - Dict: The configuration dictionary of the underlying VAE model - """ - return self.model.config.__dict__ - - def __init__(self, model: nn.Module, type: str) -> None: - """ - Initialize the VAE wrapper. - - Args: - model (nn.Module): The pipeline model containing the VAE - type (str): VAE operation type ("encoder" or "decoder") - """ - super().__init__(model) - self.model = model - - # To have different hashing for encoder/decoder - self.model.config["type"] = type - - def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tuple[Dict, Dict, List[str]]: + + def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: """ Generate ONNX export configuration for the VAE decoder. 
@@ -390,7 +305,7 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, latent_height, latent_width), + "latent_sample": torch.randn(bs, 16, 21, 24, 40), "return_dict": False, } @@ -398,7 +313,7 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu # All dimensions except channels can be dynamic dynamic_axes = { - "latent_sample": {0: "batch_size", 1: "channels", 2: "latent_height", 3: "latent_width"}, + "latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"}, } return example_inputs, dynamic_axes, output_names diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 3e15043d3..5c2ee7858 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -22,7 +22,7 @@ import torch from diffusers import WanPipeline -from QEfficient.diffusers.pipelines.pipeline_module import QEffAutoencoderKLWan, QEffWanUnifiedTransformer +from QEfficient.diffusers.pipelines.pipeline_module import QEffVAE, QEffWanUnifiedTransformer from QEfficient.diffusers.pipelines.pipeline_utils import ( ONNX_SUBFUNCTION_MODULE, ModulePerf, @@ -106,7 +106,7 @@ def __init__(self, model, **kwargs): self.transformer = QEffWanUnifiedTransformer(self.unified_wrapper) # VAE decoder for latent-to-video conversion - self.vae_decoder = QEffAutoencoderKLWan(model.vae, "decoder") + self.vae_decoder = QEffVAE(model.vae, "decoder") # Store all modules in a dictionary for easy iteration during export/compile # TODO: add text encoder on QAIC self.modules = {"transformer": self.transformer, "vae_decoder": self.vae_decoder} @@ -115,6 +115,13 @@ def __init__(self, model, **kwargs): self.tokenizer = model.tokenizer self.text_encoder.tokenizer = model.tokenizer self.scheduler = model.scheduler + + self.vae_decoder.model.forward = lambda latent_sample, return_dict: self.vae_decoder.model.decode( + latent_sample, return_dict + ) + + self.vae_decoder.get_onnx_params = self.vae_decoder.get_video_onnx_params + self.vae_decoder.model.config["_use_default_values"].sort() # Extract patch dimensions from transformer configuration _, self.patch_height, self.patch_width = self.transformer.model.config.patch_size @@ -554,11 +561,6 @@ def __call__( str(self.transformer.qpc_path), device_ids=self.transformer.device_ids ) - if self.vae_decoder.qpc_session is None: - self.vae_decoder.qpc_session = QAICInferenceSession( - str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids - ) - # Calculate compressed latent dimension for transformer buffer allocation cl, _, _, _ = calculate_latent_dimensions_with_frames( height, @@ -733,29 +735,38 @@ def __call__( # Step 9: Decode latents to video if not output_type == "latent": # Prepare latents for VAE decoding - latents = latents.to(self.model.vae.dtype) + latents = latents.to(self.vae_decoder.model.dtype) # Apply VAE normalization (denormalization) latents_mean = ( - torch.tensor(self.model.vae.config.latents_mean) - .view(1, self.model.vae.config.z_dim, 1, 1, 1) + torch.tensor(self.vae_decoder.model.config.latents_mean) + .view(1, self.vae_decoder.model.config.z_dim, 1, 1, 1) .to(latents.device, latents.dtype) ) - latents_std = 1.0 / torch.tensor(self.model.vae.config.latents_std).view( - 1, self.model.vae.config.z_dim, 1, 1, 1 + latents_std = 1.0 / 
torch.tensor(self.vae_decoder.model.config.latents_std).view( + 1, self.vae_decoder.model.config.z_dim, 1, 1, 1 ).to(latents.device, latents.dtype) latents = latents / latents_std + latents_mean - inputs_aic = {"latent_sample": latents.detach().numpy()} + # Initialize VAE decoder inference session + if self.vae_decoder.qpc_session is None: + self.vae_decoder.qpc_session = QAICInferenceSession( + str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids + ) + + # Allocate output buffer for VAE decoder + output_buffer = {"sample": np.random.rand(batch_size, 3, num_frames, height, width).astype(np.int32)} + + inputs = {"latent_sample": latents.numpy()} start_decode_time = time.perf_counter() - video = self.vae_decoder.qpc_session.run(inputs_aic) # CPU fallback + video = self.vae_decoder.qpc_session.run(inputs) # CPU fallback end_decode_time = time.perf_counter() - vae_decoder_perf = end_decode_time - start_decode_time # Post-process video for output - video = self.model.video_processor.postprocess_video(torch.tensor(video["sample"])) + video_tensor = torch.from_numpy(video["sample"]) + video = self.model.video_processor.postprocess_video(video_tensor) else: video = latents From 8a633c042d67d84224c32219940281c23184b16b Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 23 Dec 2025 17:48:02 +0000 Subject: [PATCH 03/10] Minor Fixes Signed-off-by: Mohit Soni --- .../models/autoencoders/autoencoder_kl_wan.py | 17 ++++------------- .../diffusers/pipelines/pipeline_module.py | 4 ++-- .../diffusers/pipelines/wan/pipeline_wan.py | 2 +- examples/diffusers/wan/wan_config.json | 6 ++++-- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py index 4c6a2160b..34a4961b1 100644 --- a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -1,19 +1,11 @@ -# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved. +# ----------------------------------------------------------------------------- # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause # -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# ----------------------------------------------------------------------------- import torch -from diffusers import AutoencoderKLWan from diffusers.models.autoencoders.autoencoder_kl_wan import ( WanDecoder3d, WanEncoder3d, @@ -201,4 +193,3 @@ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): else: x = self.conv_out(x) return x - diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index d9f3f0ad6..8a9930556 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -286,7 +286,7 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu } return example_inputs, dynamic_axes, output_names - + def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: """ Generate ONNX export configuration for the VAE decoder. @@ -305,7 +305,7 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, 21, 24, 40), + "latent_sample": torch.randn(bs, 16, 21, 12, 16), "return_dict": False, } diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 5c2ee7858..a2b51fe12 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -753,7 +753,7 @@ def __call__( self.vae_decoder.qpc_session = QAICInferenceSession( str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids ) - + # Allocate output buffer for VAE decoder output_buffer = {"sample": np.random.rand(batch_size, 3, num_frames, height, width).astype(np.int32)} diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index 77fba9ab7..c32054db1 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -48,12 +48,14 @@ { "onnx_path": null, "compile_dir": null, - "mdp_ts_num_devices": 4, + "mdp_ts_num_devices": 8, "mxfp6_matmul": false, "convert_to_fp16": true, "aic_num_cores": 16, "aic-enable-depth-first": true, - "compile_only":true + "compile_only":true, + "mos": 1, + "mdts_mos": 1 }, "execute": { From e350eceb6a7ad3a616df05730803a0f6566de335 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Wed, 24 Dec 2025 06:25:17 +0000 Subject: [PATCH 04/10] Description Signed-off-by: Mohit Soni --- .../diffusers/models/autoencoders/autoencoder_kl_wan.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py index 34a4961b1..868214455 100644 --- a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -18,9 +18,14 @@ modes = [] +# Used max(0, x.shape[2] - CACHE_T) instead of CACHE_T because x.shape[2] is either 1 or 4, +# and CACHE_T = 2. This ensures the value never goes negative + class QEffWanResample(WanResample): def __qeff_init__(self): + # Changed upsampling mode from "nearest-exact" to "nearest" for ONNX compatibility. 
+ # Since the scale factor is an integer, both modes behave the if self.mode in ("upsample2d", "upsample3d"): self.resample[0] = WanUpsample(scale_factor=(2.0, 2.0), mode="nearest") From 942f621306dc2ebfe6b5f99352252d6d98169d76 Mon Sep 17 00:00:00 2001 From: vtirumal Date: Mon, 5 Jan 2026 08:10:20 +0000 Subject: [PATCH 05/10] config minor clean up Signed-off-by: vtirumal --- .../pipelines/configs/wan_config.json | 27 ++++- .../diffusers/pipelines/wan/pipeline_wan.py | 2 +- examples/diffusers/wan/wan_config.json | 107 +++++++++--------- 3 files changed, 79 insertions(+), 57 deletions(-) diff --git a/QEfficient/diffusers/pipelines/configs/wan_config.json b/QEfficient/diffusers/pipelines/configs/wan_config.json index 3f5edce07..eba96f259 100644 --- a/QEfficient/diffusers/pipelines/configs/wan_config.json +++ b/QEfficient/diffusers/pipelines/configs/wan_config.json @@ -31,6 +31,31 @@ "execute": { "device_ids": null } - } + }, + "vae_decoder":{ + "specializations": [ + { + "batch_size": 1, + "num_channels": 16 + } + ], + "compilation": + { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 8, + "mxfp6_matmul": false, + "convert_to_fp16": true, + "aic_num_cores": 16, + "aic-enable-depth-first": true, + "compile_only":true, + "mos": 1, + "mdts_mos": 1 + }, + "execute": + { + "device_ids": null + } + } } } \ No newline at end of file diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index a2b51fe12..083e62243 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -760,7 +760,7 @@ def __call__( inputs = {"latent_sample": latents.numpy()} start_decode_time = time.perf_counter() - video = self.vae_decoder.qpc_session.run(inputs) # CPU fallback + video = self.vae_decoder.qpc_session.run(inputs) end_decode_time = time.perf_counter() vae_decoder_perf = end_decode_time - start_decode_time diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index c32054db1..7054d573b 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -3,65 +3,62 @@ "model_type": "wan", "modules": { "transformer": { - "specializations": [ - { - "batch_size": "1", - "num_channels": "16", - "steps": "1", - "sequence_length": "512", - "model_type": 1 - }, - { - "batch_size": "1", - "num_channels": "16", - "steps": "1", - "sequence_length": "512", - "model_type": 2 - } - ], - "compilation": { - "onnx_path": null, - "compile_dir": null, - "mdp_ts_num_devices": 16, - "mxfp6_matmul": true, - "convert_to_fp16": true, - "aic_num_cores": 16, - "mos": 1, - "mdts_mos": 1 - }, - "execute": { - "device_ids": null - } - }, - "vae_decoder": - { - "specializations": [ - { - "batch_size": 1, - "num_channels": 16, - "num_frames": 21, - "latent_height": 60, - "latent_width": 104 - } - ], - "compilation": + "specializations": [ { - "onnx_path": null, - "compile_dir": null, - "mdp_ts_num_devices": 8, - "mxfp6_matmul": false, - "convert_to_fp16": true, - "aic_num_cores": 16, - "aic-enable-depth-first": true, - "compile_only":true, - "mos": 1, - "mdts_mos": 1 + "batch_size": "1", + "num_channels": "16", + "steps": "1", + "sequence_length": "512", + "model_type": 1 }, - "execute": { - "device_ids": null + "batch_size": "1", + "num_channels": "16", + "steps": "1", + "sequence_length": "512", + "model_type": 2 + } + ], + "compilation": { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 16, + "mxfp6_matmul": 
true, + "convert_to_fp16": true, + "aic_num_cores": 16, + "mos": 1, + "mdts_mos": 1 + }, + "execute": { + "device_ids": null + } + }, + "vae_decoder": + { + "specializations": [ + { + "batch_size": 1, + "num_channels": 16 } - } + ], + "compilation": + { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 8, + "mxfp6_matmul": false, + "convert_to_fp16": true, + "aic_num_cores": 16, + "aic-enable-depth-first": true, + "compile_only":true, + "mos": 1, + "mdts_mos": 1 + }, + "execute": + { + "device_ids": null + } + } } } \ No newline at end of file From 2bdd93122f9344134729ddac1556c7dd08f57f71 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 6 Jan 2026 08:03:24 +0000 Subject: [PATCH 06/10] Comments Addressed Signed-off-by: Mohit Soni --- .../diffusers/pipelines/pipeline_module.py | 7 ++++++- .../diffusers/pipelines/wan/pipeline_wan.py | 17 ++++++++--------- examples/diffusers/wan/wan_config.json | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index 8a9930556..e8b72c352 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -302,10 +302,13 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: - output_names (List[str]): Names of model outputs """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + num_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES + latent_height = constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_180P + latent_width = constants.WAN_ONNX_EXPORT_LATENT_WIDTH_180P # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, 21, 12, 16), + "latent_sample": torch.randn(bs, 16, num_frames, latent_height, latent_width), "return_dict": False, } @@ -339,6 +342,8 @@ def export( Returns: str: Path to the exported ONNX model """ + self.model.config["_use_default_values"].sort() + return self._export( example_inputs=inputs, output_names=output_names, diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 083e62243..a5a27b141 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -21,6 +21,7 @@ import numpy as np import torch from diffusers import WanPipeline +from tqdm import tqdm from QEfficient.diffusers.pipelines.pipeline_module import QEffVAE, QEffWanUnifiedTransformer from QEfficient.diffusers.pipelines.pipeline_utils import ( @@ -121,7 +122,6 @@ def __init__(self, model, **kwargs): ) self.vae_decoder.get_onnx_params = self.vae_decoder.get_video_onnx_params - self.vae_decoder.model.config["_use_default_values"].sort() # Extract patch dimensions from transformer configuration _, self.patch_height, self.patch_width = self.transformer.model.config.patch_size @@ -227,7 +227,7 @@ def export( """ # Export each module with video-specific parameters - for module_name, module_obj in self.modules.items(): + for module_name, module_obj in tqdm(self.modules.items(), desc="Exporting modules", unit="module"): # Get ONNX export configuration with video dimensions example_inputs, dynamic_axes, output_names = module_obj.get_onnx_params() @@ -308,6 +308,7 @@ def compile( path is None for path in [ self.transformer.onnx_path, + self.vae_decoder.onnx_path, ] ): self.export(use_onnx_subfunctions=use_onnx_subfunctions) @@ -343,13 +344,11 @@ def compile( "num_frames": latent_frames, # Latent frames }, ], - 
"vae_decoder": [ - { - "num_frames": latent_frames, - "latent_height": latent_height, - "latent_width": latent_width, - } - ], + "vae_decoder": { + "num_frames": latent_frames, + "latent_height": latent_height, + "latent_width": latent_width, + }, } # Use generic utility functions for compilation diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index 7054d573b..188f7f70b 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -35,12 +35,12 @@ }, "vae_decoder": { - "specializations": [ + "specializations": { "batch_size": 1, "num_channels": 16 } - ], + , "compilation": { "onnx_path": null, From 5007858ac004cd9bd7fb90b2d88f52f59aa1e0f6 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 6 Jan 2026 09:18:34 +0000 Subject: [PATCH 07/10] Minor Changes Signed-off-by: Mohit Soni --- QEfficient/diffusers/pipelines/pipeline_module.py | 8 ++++---- QEfficient/diffusers/pipelines/wan/pipeline_wan.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index e8b72c352..d960eceae 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -302,13 +302,13 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: - output_names (List[str]): Names of model outputs """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - num_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES + latent_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES latent_height = constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_180P latent_width = constants.WAN_ONNX_EXPORT_LATENT_WIDTH_180P # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, num_frames, latent_height, latent_width), + "latent_sample": torch.randn(bs, 16, latent_frames, latent_height, latent_width), "return_dict": False, } @@ -316,7 +316,7 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # All dimensions except channels can be dynamic dynamic_axes = { - "latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"}, + "latent_sample": {0: "batch_size", 2: "latent_frames", 3: "latent_height", 4: "latent_width"}, } return example_inputs, dynamic_axes, output_names @@ -611,7 +611,7 @@ def get_onnx_params(self): "hidden_states": { 0: "batch_size", 1: "num_channels", - 2: "num_frames", + 2: "latent_frames", 3: "latent_height", 4: "latent_width", }, diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index a5a27b141..cd1b59cd8 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -334,24 +334,25 @@ def compile( "cl": cl, # Compressed latent dimension "latent_height": latent_height, # Latent space height "latent_width": latent_width, # Latent space width - "num_frames": latent_frames, # Latent frames + "latent_frames": latent_frames, # Latent frames }, # low noise { "cl": cl, # Compressed latent dimension "latent_height": latent_height, # Latent space height "latent_width": latent_width, # Latent space width - "num_frames": latent_frames, # Latent frames + "latent_frames": latent_frames, # Latent frames }, ], "vae_decoder": { - "num_frames": latent_frames, + "latent_frames": latent_frames, "latent_height": latent_height, "latent_width": latent_width, }, } # Use generic utility 
functions for compilation + logger.warning('For VAE compilation use QAIC_COMPILER_OPTS_UNSUPPORTED="-aic-hmx-conv3d" ') if parallel: compile_modules_parallel(self.modules, self.custom_config, specialization_updates) else: From 6e851fbe3232f9ec0ec36bdcd446e0d0d70b5c53 Mon Sep 17 00:00:00 2001 From: vtirumal Date: Tue, 6 Jan 2026 10:57:09 +0000 Subject: [PATCH 08/10] To skip pytest for Wan Signed-off-by: vtirumal --- scripts/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 3420c025b..d51765a4d 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -95,7 +95,7 @@ pipeline { export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic_diffusion && export HF_HUB_CACHE=/huggingface_hub && - pytest tests -m '(not cli) and (on_qaic) and (diffusion_models) and (not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log_diffusion.xml && + pytest tests -m '(not cli) and (on_qaic) and (diffusion_models) and (not wan) and (not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log_diffusion.xml && junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml && deactivate" ''' From dc42816250c3c852c42e7930ccd9dff6b9a55225 Mon Sep 17 00:00:00 2001 From: vtirumal Date: Thu, 8 Jan 2026 04:56:07 +0000 Subject: [PATCH 09/10] Adding init for encoder and compiler_only flag for wan configs Signed-off-by: vtirumal --- QEfficient/diffusers/models/autoencoders/__init__.py | 6 ++++++ QEfficient/diffusers/pipelines/configs/wan_config.json | 1 + examples/diffusers/wan/wan_config.json | 1 + tests/diffusers/wan_test_config.json | 1 + 4 files changed, 9 insertions(+) create mode 100644 QEfficient/diffusers/models/autoencoders/__init__.py diff --git a/QEfficient/diffusers/models/autoencoders/__init__.py b/QEfficient/diffusers/models/autoencoders/__init__.py new file mode 100644 index 000000000..75daf1953 --- /dev/null +++ b/QEfficient/diffusers/models/autoencoders/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- diff --git a/QEfficient/diffusers/pipelines/configs/wan_config.json b/QEfficient/diffusers/pipelines/configs/wan_config.json index eba96f259..fb6f3dccd 100644 --- a/QEfficient/diffusers/pipelines/configs/wan_config.json +++ b/QEfficient/diffusers/pipelines/configs/wan_config.json @@ -24,6 +24,7 @@ "mdp_ts_num_devices": 16, "mxfp6_matmul": true, "convert_to_fp16": true, + "compile_only":true, "aic_num_cores": 16, "mos": 1, "mdts_mos": 1 diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index 188f7f70b..efeb7c877 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -25,6 +25,7 @@ "mdp_ts_num_devices": 16, "mxfp6_matmul": true, "convert_to_fp16": true, + "compile_only":true, "aic_num_cores": 16, "mos": 1, "mdts_mos": 1 diff --git a/tests/diffusers/wan_test_config.json b/tests/diffusers/wan_test_config.json index 1ed36294a..25869bbe8 100644 --- a/tests/diffusers/wan_test_config.json +++ b/tests/diffusers/wan_test_config.json @@ -51,6 +51,7 @@ "mdp_ts_num_devices": 1, "mxfp6_matmul": true, "convert_to_fp16": true, + "compile_only":true, "aic_num_cores": 16, "mos": 1, "mdts_mos": 1 From 92307b61603df55e078ce3867ab67c5a960e49b0 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Thu, 8 Jan 2026 09:42:19 +0000 Subject: [PATCH 10/10] VAE export fix Signed-off-by: Mohit Soni --- QEfficient/diffusers/pipelines/pipeline_module.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index d960eceae..4cc70d056 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -342,7 +342,9 @@ def export( Returns: str: Path to the exported ONNX model """ - self.model.config["_use_default_values"].sort() + + if hasattr(self.model.config, "_use_default_values"): + self.model.config["_use_default_values"].sort() return self._export( example_inputs=inputs,
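A note on the frame-doubling reshape in QEffWanResample.forward (patch 01): for "upsample3d", the reshape to (b, 2, c, t, h, w) implies the time conv has produced 2*c channels, i.e. two temporal outputs per input frame, and the stack/reshape pair interleaves those two groups along the time axis so the frame count doubles. The following minimal sketch (a made-up tensor, plain PyTorch, not QEfficient code) checks that interleaving:

    import torch

    b, c, t, h, w = 1, 3, 4, 2, 2
    # Stand-in for the time-conv output: 2*c channels, two temporal "copies" per input frame.
    x = torch.arange(b * 2 * c * t * h * w, dtype=torch.float32).reshape(b, 2 * c, t, h, w)

    y = x.reshape(b, 2, c, t, h, w)
    y = torch.stack((y[:, 0], y[:, 1]), dim=3)  # (b, c, t, 2, h, w)
    y = y.reshape(b, c, t * 2, h, w)            # (b, c, 2*t, h, w)

    # Even output frames come from the first channel group, odd frames from the second,
    # so each input frame expands into two consecutive output frames.
    assert torch.equal(y[:, :, 0::2], x[:, :c])
    assert torch.equal(y[:, :, 1::2], x[:, c:])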
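On the decode path added to pipeline_wan.py: before the compiled decoder runs, the latents are denormalized with the per-channel mean/std stored in the VAE config, and the result is passed as a numpy array under the ONNX input name "latent_sample". A small sketch of that arithmetic with placeholder values (z_dim, shapes and statistics here are illustrative, not the real Wan 2.2 config):

    import torch

    z_dim = 16                                     # illustrative; the pipeline reads vae.config.z_dim
    latents = torch.randn(1, z_dim, 21, 60, 104)   # (batch, z_dim, latent_frames, latent_height, latent_width)

    # Per-channel statistics; the pipeline takes these from vae.config.latents_mean / latents_std.
    latents_mean = torch.randn(z_dim).view(1, z_dim, 1, 1, 1)
    latents_std = 1.0 / (torch.rand(z_dim) + 0.5).view(1, z_dim, 1, 1, 1)

    # Undo the normalization applied when the latents were produced.
    latents = latents / latents_std + latents_mean

    # The compiled decoder is fed numpy inputs keyed by the ONNX input name:
    inputs = {"latent_sample": latents.numpy()}
    # video = vae_decoder.qpc_session.run(inputs)             # QAICInferenceSession on QAIC
    # frames = torch.from_numpy(video["sample"])              # decoded (batch, 3, num_frames, H, W)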
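The export-time dynamic axes and the compile-time specialization must agree on axis names for the decoder; after patch 07 both sides use latent_frames / latent_height / latent_width. A condensed view of that pairing (the concrete values mirror the 21 x 60 x 104 latent shape from the example config and are illustrative only):

    # ONNX export: input dimensions that stay symbolic
    dynamic_axes = {
        "latent_sample": {0: "batch_size", 2: "latent_frames", 3: "latent_height", 4: "latent_width"},
    }

    # QPC compile: concrete values bound to those symbols for one specialization
    specialization = {
        "batch_size": 1,
        "num_channels": 16,
        "latent_frames": 21,
        "latent_height": 60,
        "latent_width": 104,
    }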