From a03145845cea1fddec1cb9f8d7f4ceab9d11ba67 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 23 Dec 2025 07:03:04 +0000 Subject: [PATCH 01/10] Adding vae decoder in Wan Signed-off-by: Mohit Soni --- .../models/autoencoders/autoencoder_kl_wan.py | 221 ++++++++++++++++++ .../diffusers/models/pytorch_transforms.py | 19 ++ .../diffusers/pipelines/pipeline_module.py | 116 +++++++++ .../diffusers/pipelines/wan/pipeline_wan.py | 48 ++-- examples/diffusers/wan/wan_config.json | 30 ++- 5 files changed, 417 insertions(+), 17 deletions(-) create mode 100644 QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py new file mode 100644 index 000000000..abd9d9491 --- /dev/null +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -0,0 +1,221 @@ +# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from diffusers import AutoencoderKLWan +from diffusers.models.autoencoders.autoencoder_kl_wan import ( + WanDecoder3d, + WanEncoder3d, + WanResample, + WanResidualBlock, + WanUpsample, +) + +CACHE_T = 2 + +modes = [] + + +class QEffWanResample(WanResample): + def __qeff_init__(self): + if self.mode in ("upsample2d", "upsample3d"): + self.resample[0] = WanUpsample(scale_factor=(2.0, 2.0), mode="nearest") + + def forward(self, x, feat_cache=None, feat_idx=[0]): + b, c, t, h, w = x.size() + if self.mode == "upsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = "Rep" + feat_idx[0] += 1 + else: + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep": + # cache last frame of last two chunk + cache_x = torch.cat( + [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2 + ) + if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep": + cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2) + if feat_cache[idx] == "Rep": + x = self.time_conv(x) + else: + x = self.time_conv(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + + x = x.reshape(b, 2, c, t, h, w) + x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3) + x = x.reshape(b, c, t * 2, h, w) + t = x.shape[2] + x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w) + modes.append(self.mode) + x = self.resample(x) + x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4) + + if self.mode == "downsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = x.clone() + feat_idx[0] += 1 + else: + cache_x = x[:, :, -1:, :, :].clone() + x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + return x + + +class 
QEffWanResidualBlock(WanResidualBlock): + def forward(self, x, feat_cache=None, feat_idx=[0]): + # Apply shortcut connection + h = self.conv_shortcut(x) + + # First normalization and activation + x = self.norm1(x) + x = self.nonlinearity(x) + + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + # Second normalization and activation + x = self.norm2(x) + x = self.nonlinearity(x) + + # Dropout + x = self.dropout(x) + + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + + x = self.conv2(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv2(x) + + # Add residual connection + return x + h + + +class QEffWanEncoder3d(WanEncoder3d): + def forward(self, x, feat_cache=None, feat_idx=[0]): + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + x = self.conv_in(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_in(x) + + ## downsamples + for layer in self.down_blocks: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## middle + x = self.mid_block(x, feat_cache, feat_idx) + + ## head + x = self.norm_out(x) + x = self.nonlinearity(x) + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + x = self.conv_out(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_out(x) + return x + + +class QEffWanDecoder3d(WanDecoder3d): + def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): + ## conv1 + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2) + x = self.conv_in(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_in(x) + + ## middle + x = self.mid_block(x, feat_cache, feat_idx) + + ## upsamples + for up_block in self.up_blocks: + x = up_block(x, feat_cache, feat_idx, first_chunk=first_chunk) + + ## head + x = self.norm_out(x) + x = self.nonlinearity(x) + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, max(0, x.shape[2] - CACHE_T) :, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], 
dim=2) + x = self.conv_out(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv_out(x) + return x + + +class QEffAutoencoderKLWan(AutoencoderKLWan): + def forward( + self, + latent_sample: torch.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator=None, + ): + """ + Args: + sample (`torch.Tensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + + return self.decode(latent_sample) diff --git a/QEfficient/diffusers/models/pytorch_transforms.py b/QEfficient/diffusers/models/pytorch_transforms.py index 4fb5c3f12..6fa8cc28a 100644 --- a/QEfficient/diffusers/models/pytorch_transforms.py +++ b/QEfficient/diffusers/models/pytorch_transforms.py @@ -5,6 +5,13 @@ # # ----------------------------------------------------------------------------- +from diffusers.models.autoencoders.autoencoder_kl_wan import ( + AutoencoderKLWan, + WanDecoder3d, + WanEncoder3d, + WanResample, + WanResidualBlock, +) from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle, RMSNorm from diffusers.models.transformers.transformer_flux import ( FluxAttention, @@ -18,6 +25,13 @@ from QEfficient.base.pytorch_transforms import ModuleMappingTransform from QEfficient.customop.rms_norm import CustomRMSNormAIC +from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import ( + QEffAutoencoderKLWan, + QEffWanDecoder3d, + QEffWanEncoder3d, + QEffWanResample, + QEffWanResidualBlock, +) from QEfficient.diffusers.models.normalization import ( QEffAdaLayerNormContinuous, QEffAdaLayerNormZero, @@ -54,6 +68,11 @@ class AttentionTransform(ModuleMappingTransform): WanAttnProcessor: QEffWanAttnProcessor, WanAttention: QEffWanAttention, WanTransformer3DModel: QEffWanTransformer3DModel, + WanDecoder3d: QEffWanDecoder3d, + WanEncoder3d: QEffWanEncoder3d, + WanResidualBlock: QEffWanResidualBlock, + WanResample: QEffWanResample, + AutoencoderKLWan: QEffAutoencoderKLWan, } diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index 19e7701d4..268b47dae 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -214,6 +214,122 @@ def compile(self, specializations: List[Dict], **compiler_options) -> None: self._compile(specializations=specializations, **compiler_options) +class QEffAutoencoderKLWan(QEFFBaseModel): + """ + Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. + + This class handles VAE models with specific transformations and optimizations + for efficient inference on Qualcomm AI hardware. VAE models are used in diffusion + pipelines for encoding images to latent space and decoding latents back to images. + + Attributes: + model (nn.Module): The wrapped VAE model (deep copy of original) + type (str): VAE operation type ("encoder" or "decoder") + _pytorch_transforms (List): PyTorch transformations applied before ONNX export + _onnx_transforms (List): ONNX transformations applied after export + """ + + _pytorch_transforms = [AttentionTransform, CustomOpsTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + @property + def get_model_config(self) -> Dict: + """ + Get the model configuration as a dictionary. 
+ + Returns: + Dict: The configuration dictionary of the underlying VAE model + """ + return self.model.config.__dict__ + + def __init__(self, model: nn.Module, type: str) -> None: + """ + Initialize the VAE wrapper. + + Args: + model (nn.Module): The pipeline model containing the VAE + type (str): VAE operation type ("encoder" or "decoder") + """ + super().__init__(model) + self.model = model + + # To have different hashing for encoder/decoder + self.model.config["type"] = type + + def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: + """ + Generate ONNX export configuration for the VAE decoder. + + Args: + latent_height (int): Height of latent representation (default: 32) + latent_width (int): Width of latent representation (default: 32) + + Returns: + Tuple containing: + - example_inputs (Dict): Sample inputs for ONNX export + - dynamic_axes (Dict): Specification of dynamic dimensions + - output_names (List[str]): Names of model outputs + """ + bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + + # VAE decoder takes latent representation as input + example_inputs = { + "latent_sample": torch.randn(bs, 16, 21, 24, 40), + "return_dict": False, + } + + output_names = ["sample"] + + # All dimensions except channels can be dynamic + dynamic_axes = { + "latent_sample": {2: "num_frames", 3: "latent_height", 4: "latent_width"}, + } + + return example_inputs, dynamic_axes, output_names + + def export( + self, + inputs: Dict, + output_names: List[str], + dynamic_axes: Dict, + export_dir: str = None, + export_kwargs: Dict = {}, + ) -> str: + """ + Export the VAE model to ONNX format. + + Args: + inputs (Dict): Example inputs for ONNX export + output_names (List[str]): Names of model outputs + dynamic_axes (Dict): Specification of dynamic dimensions + export_dir (str, optional): Directory to save ONNX model + export_kwargs (Dict, optional): Additional export arguments + + Returns: + str: Path to the exported ONNX model + """ + + self.model.config["_use_default_values"].sort() + + return self._export( + example_inputs=inputs, + output_names=output_names, + dynamic_axes=dynamic_axes, + export_dir=export_dir, + **export_kwargs, + ) + + def compile(self, specializations: List[Dict], **compiler_options) -> None: + """ + Compile the ONNX model for Qualcomm AI hardware. + + Args: + specializations (List[Dict]): Model specialization configurations + **compiler_options: Additional compiler options + """ + self._compile(specializations=specializations, **compiler_options) + + class QEffVAE(QEFFBaseModel): """ Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 888763af0..3e15043d3 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -11,7 +11,7 @@ for high-performance text-to-video generation on Qualcomm AI hardware. The pipeline supports WAN 2.2 architectures with unified transformer. -TODO: 1. Update Vae, umt5 to Qaic; present running on cpu +TODO: 1. 
Update umt5 to Qaic; present running on cpu """ import os @@ -22,7 +22,7 @@ import torch from diffusers import WanPipeline -from QEfficient.diffusers.pipelines.pipeline_module import QEffWanUnifiedTransformer +from QEfficient.diffusers.pipelines.pipeline_module import QEffAutoencoderKLWan, QEffWanUnifiedTransformer from QEfficient.diffusers.pipelines.pipeline_utils import ( ONNX_SUBFUNCTION_MODULE, ModulePerf, @@ -106,11 +106,10 @@ def __init__(self, model, **kwargs): self.transformer = QEffWanUnifiedTransformer(self.unified_wrapper) # VAE decoder for latent-to-video conversion - self.vae_decode = model.vae - + self.vae_decoder = QEffAutoencoderKLWan(model.vae, "decoder") # Store all modules in a dictionary for easy iteration during export/compile - # TODO: add text encoder, vae decoder on QAIC - self.modules = {"transformer": self.transformer} + # TODO: add text encoder on QAIC + self.modules = {"transformer": self.transformer, "vae_decoder": self.vae_decoder} # Copy tokenizers and scheduler from the original model self.tokenizer = model.tokenizer @@ -336,7 +335,14 @@ def compile( "latent_width": latent_width, # Latent space width "num_frames": latent_frames, # Latent frames }, - ] + ], + "vae_decoder": [ + { + "num_frames": latent_frames, + "latent_height": latent_height, + "latent_width": latent_width, + } + ], } # Use generic utility functions for compilation @@ -548,6 +554,11 @@ def __call__( str(self.transformer.qpc_path), device_ids=self.transformer.device_ids ) + if self.vae_decoder.qpc_session is None: + self.vae_decoder.qpc_session = QAICInferenceSession( + str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids + ) + # Calculate compressed latent dimension for transformer buffer allocation cl, _, _, _ = calculate_latent_dimensions_with_frames( height, @@ -722,31 +733,36 @@ def __call__( # Step 9: Decode latents to video if not output_type == "latent": # Prepare latents for VAE decoding - latents = latents.to(self.vae_decode.dtype) + latents = latents.to(self.model.vae.dtype) # Apply VAE normalization (denormalization) latents_mean = ( - torch.tensor(self.vae_decode.config.latents_mean) - .view(1, self.vae_decode.config.z_dim, 1, 1, 1) + torch.tensor(self.model.vae.config.latents_mean) + .view(1, self.model.vae.config.z_dim, 1, 1, 1) .to(latents.device, latents.dtype) ) - latents_std = 1.0 / torch.tensor(self.vae_decode.config.latents_std).view( - 1, self.vae_decode.config.z_dim, 1, 1, 1 + latents_std = 1.0 / torch.tensor(self.model.vae.config.latents_std).view( + 1, self.model.vae.config.z_dim, 1, 1, 1 ).to(latents.device, latents.dtype) latents = latents / latents_std + latents_mean - # TODO: Enable VAE on QAIC - # VAE Decode latents to video using CPU (temporary) - video = self.model.vae.decode(latents, return_dict=False)[0] # CPU fallback + inputs_aic = {"latent_sample": latents.detach().numpy()} + + start_decode_time = time.perf_counter() + video = self.vae_decoder.qpc_session.run(inputs_aic) # CPU fallback + end_decode_time = time.perf_counter() + + vae_decoder_perf = end_decode_time - start_decode_time # Post-process video for output - video = self.model.video_processor.postprocess_video(video.detach()) + video = self.model.video_processor.postprocess_video(torch.tensor(video["sample"])) else: video = latents # Step 10: Collect performance metrics perf_data = { "transformer": transformer_perf, # Unified transformer (QAIC) + "vae_decoder": vae_decoder_perf, } # Build performance metrics for output diff --git a/examples/diffusers/wan/wan_config.json 
b/examples/diffusers/wan/wan_config.json index 7e752ba14..77fba9ab7 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -32,6 +32,34 @@ "execute": { "device_ids": null } - } + }, + "vae_decoder": + { + "specializations": [ + { + "batch_size": 1, + "num_channels": 16, + "num_frames": 21, + "latent_height": 60, + "latent_width": 104 + } + ], + "compilation": + { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 4, + "mxfp6_matmul": false, + "convert_to_fp16": true, + "aic_num_cores": 16, + "aic-enable-depth-first": true, + "compile_only":true + }, + "execute": + { + "device_ids": null + } + } + } } \ No newline at end of file From d54a374ccbe3cbd5db36184441134caf371d0ede Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 23 Dec 2025 17:33:00 +0000 Subject: [PATCH 02/10] Addressed comments Signed-off-by: Mohit Soni --- .../models/autoencoders/autoencoder_kl_wan.py | 17 --- .../diffusers/models/pytorch_transforms.py | 3 - .../diffusers/pipelines/pipeline_module.py | 103 ++---------------- .../diffusers/pipelines/wan/pipeline_wan.py | 43 +++++--- 4 files changed, 36 insertions(+), 130 deletions(-) diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py index abd9d9491..4c6a2160b 100644 --- a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -202,20 +202,3 @@ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): x = self.conv_out(x) return x - -class QEffAutoencoderKLWan(AutoencoderKLWan): - def forward( - self, - latent_sample: torch.Tensor, - sample_posterior: bool = False, - return_dict: bool = True, - generator=None, - ): - """ - Args: - sample (`torch.Tensor`): Input sample. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
- """ - - return self.decode(latent_sample) diff --git a/QEfficient/diffusers/models/pytorch_transforms.py b/QEfficient/diffusers/models/pytorch_transforms.py index 6fa8cc28a..fa637b2e9 100644 --- a/QEfficient/diffusers/models/pytorch_transforms.py +++ b/QEfficient/diffusers/models/pytorch_transforms.py @@ -6,7 +6,6 @@ # ----------------------------------------------------------------------------- from diffusers.models.autoencoders.autoencoder_kl_wan import ( - AutoencoderKLWan, WanDecoder3d, WanEncoder3d, WanResample, @@ -26,7 +25,6 @@ from QEfficient.base.pytorch_transforms import ModuleMappingTransform from QEfficient.customop.rms_norm import CustomRMSNormAIC from QEfficient.diffusers.models.autoencoders.autoencoder_kl_wan import ( - QEffAutoencoderKLWan, QEffWanDecoder3d, QEffWanEncoder3d, QEffWanResample, @@ -72,7 +70,6 @@ class AttentionTransform(ModuleMappingTransform): WanEncoder3d: QEffWanEncoder3d, WanResidualBlock: QEffWanResidualBlock, WanResample: QEffWanResample, - AutoencoderKLWan: QEffAutoencoderKLWan, } diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index 268b47dae..d9f3f0ad6 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -214,7 +214,7 @@ def compile(self, specializations: List[Dict], **compiler_options) -> None: self._compile(specializations=specializations, **compiler_options) -class QEffAutoencoderKLWan(QEFFBaseModel): +class QEffVAE(QEFFBaseModel): """ Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. @@ -229,7 +229,7 @@ class QEffAutoencoderKLWan(QEFFBaseModel): _onnx_transforms (List): ONNX transformations applied after export """ - _pytorch_transforms = [AttentionTransform, CustomOpsTransform] + _pytorch_transforms = [CustomOpsTransform, AttentionTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] @property @@ -256,7 +256,7 @@ def __init__(self, model: nn.Module, type: str) -> None: # To have different hashing for encoder/decoder self.model.config["type"] = type - def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: + def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tuple[Dict, Dict, List[str]]: """ Generate ONNX export configuration for the VAE decoder. @@ -274,7 +274,7 @@ def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, 21, 24, 40), + "latent_sample": torch.randn(bs, 16, latent_height, latent_width), "return_dict": False, } @@ -282,97 +282,12 @@ def get_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # All dimensions except channels can be dynamic dynamic_axes = { - "latent_sample": {2: "num_frames", 3: "latent_height", 4: "latent_width"}, + "latent_sample": {0: "batch_size", 1: "channels", 2: "latent_height", 3: "latent_width"}, } return example_inputs, dynamic_axes, output_names - - def export( - self, - inputs: Dict, - output_names: List[str], - dynamic_axes: Dict, - export_dir: str = None, - export_kwargs: Dict = {}, - ) -> str: - """ - Export the VAE model to ONNX format. 
- - Args: - inputs (Dict): Example inputs for ONNX export - output_names (List[str]): Names of model outputs - dynamic_axes (Dict): Specification of dynamic dimensions - export_dir (str, optional): Directory to save ONNX model - export_kwargs (Dict, optional): Additional export arguments - - Returns: - str: Path to the exported ONNX model - """ - - self.model.config["_use_default_values"].sort() - - return self._export( - example_inputs=inputs, - output_names=output_names, - dynamic_axes=dynamic_axes, - export_dir=export_dir, - **export_kwargs, - ) - - def compile(self, specializations: List[Dict], **compiler_options) -> None: - """ - Compile the ONNX model for Qualcomm AI hardware. - - Args: - specializations (List[Dict]): Model specialization configurations - **compiler_options: Additional compiler options - """ - self._compile(specializations=specializations, **compiler_options) - - -class QEffVAE(QEFFBaseModel): - """ - Wrapper for Variational Autoencoder (VAE) models with ONNX export and QAIC compilation. - - This class handles VAE models with specific transformations and optimizations - for efficient inference on Qualcomm AI hardware. VAE models are used in diffusion - pipelines for encoding images to latent space and decoding latents back to images. - - Attributes: - model (nn.Module): The wrapped VAE model (deep copy of original) - type (str): VAE operation type ("encoder" or "decoder") - _pytorch_transforms (List): PyTorch transformations applied before ONNX export - _onnx_transforms (List): ONNX transformations applied after export - """ - - _pytorch_transforms = [CustomOpsTransform] - _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - - @property - def get_model_config(self) -> Dict: - """ - Get the model configuration as a dictionary. - - Returns: - Dict: The configuration dictionary of the underlying VAE model - """ - return self.model.config.__dict__ - - def __init__(self, model: nn.Module, type: str) -> None: - """ - Initialize the VAE wrapper. - - Args: - model (nn.Module): The pipeline model containing the VAE - type (str): VAE operation type ("encoder" or "decoder") - """ - super().__init__(model) - self.model = model - - # To have different hashing for encoder/decoder - self.model.config["type"] = type - - def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tuple[Dict, Dict, List[str]]: + + def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: """ Generate ONNX export configuration for the VAE decoder. 
@@ -390,7 +305,7 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, latent_height, latent_width), + "latent_sample": torch.randn(bs, 16, 21, 24, 40), "return_dict": False, } @@ -398,7 +313,7 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu # All dimensions except channels can be dynamic dynamic_axes = { - "latent_sample": {0: "batch_size", 1: "channels", 2: "latent_height", 3: "latent_width"}, + "latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"}, } return example_inputs, dynamic_axes, output_names diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 3e15043d3..5c2ee7858 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -22,7 +22,7 @@ import torch from diffusers import WanPipeline -from QEfficient.diffusers.pipelines.pipeline_module import QEffAutoencoderKLWan, QEffWanUnifiedTransformer +from QEfficient.diffusers.pipelines.pipeline_module import QEffVAE, QEffWanUnifiedTransformer from QEfficient.diffusers.pipelines.pipeline_utils import ( ONNX_SUBFUNCTION_MODULE, ModulePerf, @@ -106,7 +106,7 @@ def __init__(self, model, **kwargs): self.transformer = QEffWanUnifiedTransformer(self.unified_wrapper) # VAE decoder for latent-to-video conversion - self.vae_decoder = QEffAutoencoderKLWan(model.vae, "decoder") + self.vae_decoder = QEffVAE(model.vae, "decoder") # Store all modules in a dictionary for easy iteration during export/compile # TODO: add text encoder on QAIC self.modules = {"transformer": self.transformer, "vae_decoder": self.vae_decoder} @@ -115,6 +115,13 @@ def __init__(self, model, **kwargs): self.tokenizer = model.tokenizer self.text_encoder.tokenizer = model.tokenizer self.scheduler = model.scheduler + + self.vae_decoder.model.forward = lambda latent_sample, return_dict: self.vae_decoder.model.decode( + latent_sample, return_dict + ) + + self.vae_decoder.get_onnx_params = self.vae_decoder.get_video_onnx_params + self.vae_decoder.model.config["_use_default_values"].sort() # Extract patch dimensions from transformer configuration _, self.patch_height, self.patch_width = self.transformer.model.config.patch_size @@ -554,11 +561,6 @@ def __call__( str(self.transformer.qpc_path), device_ids=self.transformer.device_ids ) - if self.vae_decoder.qpc_session is None: - self.vae_decoder.qpc_session = QAICInferenceSession( - str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids - ) - # Calculate compressed latent dimension for transformer buffer allocation cl, _, _, _ = calculate_latent_dimensions_with_frames( height, @@ -733,29 +735,38 @@ def __call__( # Step 9: Decode latents to video if not output_type == "latent": # Prepare latents for VAE decoding - latents = latents.to(self.model.vae.dtype) + latents = latents.to(self.vae_decoder.model.dtype) # Apply VAE normalization (denormalization) latents_mean = ( - torch.tensor(self.model.vae.config.latents_mean) - .view(1, self.model.vae.config.z_dim, 1, 1, 1) + torch.tensor(self.vae_decoder.model.config.latents_mean) + .view(1, self.vae_decoder.model.config.z_dim, 1, 1, 1) .to(latents.device, latents.dtype) ) - latents_std = 1.0 / torch.tensor(self.model.vae.config.latents_std).view( - 1, self.model.vae.config.z_dim, 1, 1, 1 + latents_std = 1.0 / 
torch.tensor(self.vae_decoder.model.config.latents_std).view( + 1, self.vae_decoder.model.config.z_dim, 1, 1, 1 ).to(latents.device, latents.dtype) latents = latents / latents_std + latents_mean - inputs_aic = {"latent_sample": latents.detach().numpy()} + # Initialize VAE decoder inference session + if self.vae_decoder.qpc_session is None: + self.vae_decoder.qpc_session = QAICInferenceSession( + str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids + ) + + # Allocate output buffer for VAE decoder + output_buffer = {"sample": np.random.rand(batch_size, 3, num_frames, height, width).astype(np.int32)} + + inputs = {"latent_sample": latents.numpy()} start_decode_time = time.perf_counter() - video = self.vae_decoder.qpc_session.run(inputs_aic) # CPU fallback + video = self.vae_decoder.qpc_session.run(inputs) # CPU fallback end_decode_time = time.perf_counter() - vae_decoder_perf = end_decode_time - start_decode_time # Post-process video for output - video = self.model.video_processor.postprocess_video(torch.tensor(video["sample"])) + video_tensor = torch.from_numpy(video["sample"]) + video = self.model.video_processor.postprocess_video(video_tensor) else: video = latents From 8a633c042d67d84224c32219940281c23184b16b Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 23 Dec 2025 17:48:02 +0000 Subject: [PATCH 03/10] Minor Fixes Signed-off-by: Mohit Soni --- .../models/autoencoders/autoencoder_kl_wan.py | 17 ++++------------- .../diffusers/pipelines/pipeline_module.py | 4 ++-- .../diffusers/pipelines/wan/pipeline_wan.py | 2 +- examples/diffusers/wan/wan_config.json | 6 ++++-- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py index 4c6a2160b..34a4961b1 100644 --- a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -1,19 +1,11 @@ -# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved. +# ----------------------------------------------------------------------------- # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause # -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# ----------------------------------------------------------------------------- import torch -from diffusers import AutoencoderKLWan from diffusers.models.autoencoders.autoencoder_kl_wan import ( WanDecoder3d, WanEncoder3d, @@ -201,4 +193,3 @@ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): else: x = self.conv_out(x) return x - diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index d9f3f0ad6..8a9930556 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -286,7 +286,7 @@ def get_onnx_params(self, latent_height: int = 32, latent_width: int = 32) -> Tu } return example_inputs, dynamic_axes, output_names - + def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: """ Generate ONNX export configuration for the VAE decoder. @@ -305,7 +305,7 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, 21, 24, 40), + "latent_sample": torch.randn(bs, 16, 21, 12, 16), "return_dict": False, } diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 5c2ee7858..a2b51fe12 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -753,7 +753,7 @@ def __call__( self.vae_decoder.qpc_session = QAICInferenceSession( str(self.vae_decoder.qpc_path), device_ids=self.vae_decoder.device_ids ) - + # Allocate output buffer for VAE decoder output_buffer = {"sample": np.random.rand(batch_size, 3, num_frames, height, width).astype(np.int32)} diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index 77fba9ab7..c32054db1 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -48,12 +48,14 @@ { "onnx_path": null, "compile_dir": null, - "mdp_ts_num_devices": 4, + "mdp_ts_num_devices": 8, "mxfp6_matmul": false, "convert_to_fp16": true, "aic_num_cores": 16, "aic-enable-depth-first": true, - "compile_only":true + "compile_only":true, + "mos": 1, + "mdts_mos": 1 }, "execute": { From e350eceb6a7ad3a616df05730803a0f6566de335 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Wed, 24 Dec 2025 06:25:17 +0000 Subject: [PATCH 04/10] Description Signed-off-by: Mohit Soni --- .../diffusers/models/autoencoders/autoencoder_kl_wan.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py index 34a4961b1..868214455 100644 --- a/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/QEfficient/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -18,9 +18,14 @@ modes = [] +# Used max(0, x.shape[2] - CACHE_T) instead of CACHE_T because x.shape[2] is either 1 or 4, +# and CACHE_T = 2. This ensures the value never goes negative + class QEffWanResample(WanResample): def __qeff_init__(self): + # Changed upsampling mode from "nearest-exact" to "nearest" for ONNX compatibility. 
+ # Since the scale factor is an integer, both modes behave the if self.mode in ("upsample2d", "upsample3d"): self.resample[0] = WanUpsample(scale_factor=(2.0, 2.0), mode="nearest") From 942f621306dc2ebfe6b5f99352252d6d98169d76 Mon Sep 17 00:00:00 2001 From: vtirumal Date: Mon, 5 Jan 2026 08:10:20 +0000 Subject: [PATCH 05/10] config minor clean up Signed-off-by: vtirumal --- .../pipelines/configs/wan_config.json | 27 ++++- .../diffusers/pipelines/wan/pipeline_wan.py | 2 +- examples/diffusers/wan/wan_config.json | 107 +++++++++--------- 3 files changed, 79 insertions(+), 57 deletions(-) diff --git a/QEfficient/diffusers/pipelines/configs/wan_config.json b/QEfficient/diffusers/pipelines/configs/wan_config.json index 3f5edce07..eba96f259 100644 --- a/QEfficient/diffusers/pipelines/configs/wan_config.json +++ b/QEfficient/diffusers/pipelines/configs/wan_config.json @@ -31,6 +31,31 @@ "execute": { "device_ids": null } - } + }, + "vae_decoder":{ + "specializations": [ + { + "batch_size": 1, + "num_channels": 16 + } + ], + "compilation": + { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 8, + "mxfp6_matmul": false, + "convert_to_fp16": true, + "aic_num_cores": 16, + "aic-enable-depth-first": true, + "compile_only":true, + "mos": 1, + "mdts_mos": 1 + }, + "execute": + { + "device_ids": null + } + } } } \ No newline at end of file diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index a2b51fe12..083e62243 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -760,7 +760,7 @@ def __call__( inputs = {"latent_sample": latents.numpy()} start_decode_time = time.perf_counter() - video = self.vae_decoder.qpc_session.run(inputs) # CPU fallback + video = self.vae_decoder.qpc_session.run(inputs) end_decode_time = time.perf_counter() vae_decoder_perf = end_decode_time - start_decode_time diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index c32054db1..7054d573b 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -3,65 +3,62 @@ "model_type": "wan", "modules": { "transformer": { - "specializations": [ - { - "batch_size": "1", - "num_channels": "16", - "steps": "1", - "sequence_length": "512", - "model_type": 1 - }, - { - "batch_size": "1", - "num_channels": "16", - "steps": "1", - "sequence_length": "512", - "model_type": 2 - } - ], - "compilation": { - "onnx_path": null, - "compile_dir": null, - "mdp_ts_num_devices": 16, - "mxfp6_matmul": true, - "convert_to_fp16": true, - "aic_num_cores": 16, - "mos": 1, - "mdts_mos": 1 - }, - "execute": { - "device_ids": null - } - }, - "vae_decoder": - { - "specializations": [ - { - "batch_size": 1, - "num_channels": 16, - "num_frames": 21, - "latent_height": 60, - "latent_width": 104 - } - ], - "compilation": + "specializations": [ { - "onnx_path": null, - "compile_dir": null, - "mdp_ts_num_devices": 8, - "mxfp6_matmul": false, - "convert_to_fp16": true, - "aic_num_cores": 16, - "aic-enable-depth-first": true, - "compile_only":true, - "mos": 1, - "mdts_mos": 1 + "batch_size": "1", + "num_channels": "16", + "steps": "1", + "sequence_length": "512", + "model_type": 1 }, - "execute": { - "device_ids": null + "batch_size": "1", + "num_channels": "16", + "steps": "1", + "sequence_length": "512", + "model_type": 2 + } + ], + "compilation": { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 16, + "mxfp6_matmul": 
true, + "convert_to_fp16": true, + "aic_num_cores": 16, + "mos": 1, + "mdts_mos": 1 + }, + "execute": { + "device_ids": null + } + }, + "vae_decoder": + { + "specializations": [ + { + "batch_size": 1, + "num_channels": 16 } - } + ], + "compilation": + { + "onnx_path": null, + "compile_dir": null, + "mdp_ts_num_devices": 8, + "mxfp6_matmul": false, + "convert_to_fp16": true, + "aic_num_cores": 16, + "aic-enable-depth-first": true, + "compile_only":true, + "mos": 1, + "mdts_mos": 1 + }, + "execute": + { + "device_ids": null + } + } } } \ No newline at end of file From 2bdd93122f9344134729ddac1556c7dd08f57f71 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 6 Jan 2026 08:03:24 +0000 Subject: [PATCH 06/10] Comments Addressed Signed-off-by: Mohit Soni --- .../diffusers/pipelines/pipeline_module.py | 7 ++++++- .../diffusers/pipelines/wan/pipeline_wan.py | 17 ++++++++--------- examples/diffusers/wan/wan_config.json | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index 8a9930556..e8b72c352 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -302,10 +302,13 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: - output_names (List[str]): Names of model outputs """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + num_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES + latent_height = constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_180P + latent_width = constants.WAN_ONNX_EXPORT_LATENT_WIDTH_180P # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, 21, 12, 16), + "latent_sample": torch.randn(bs, 16, num_frames, latent_height, latent_width), "return_dict": False, } @@ -339,6 +342,8 @@ def export( Returns: str: Path to the exported ONNX model """ + self.model.config["_use_default_values"].sort() + return self._export( example_inputs=inputs, output_names=output_names, diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index 083e62243..a5a27b141 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -21,6 +21,7 @@ import numpy as np import torch from diffusers import WanPipeline +from tqdm import tqdm from QEfficient.diffusers.pipelines.pipeline_module import QEffVAE, QEffWanUnifiedTransformer from QEfficient.diffusers.pipelines.pipeline_utils import ( @@ -121,7 +122,6 @@ def __init__(self, model, **kwargs): ) self.vae_decoder.get_onnx_params = self.vae_decoder.get_video_onnx_params - self.vae_decoder.model.config["_use_default_values"].sort() # Extract patch dimensions from transformer configuration _, self.patch_height, self.patch_width = self.transformer.model.config.patch_size @@ -227,7 +227,7 @@ def export( """ # Export each module with video-specific parameters - for module_name, module_obj in self.modules.items(): + for module_name, module_obj in tqdm(self.modules.items(), desc="Exporting modules", unit="module"): # Get ONNX export configuration with video dimensions example_inputs, dynamic_axes, output_names = module_obj.get_onnx_params() @@ -308,6 +308,7 @@ def compile( path is None for path in [ self.transformer.onnx_path, + self.vae_decoder.onnx_path, ] ): self.export(use_onnx_subfunctions=use_onnx_subfunctions) @@ -343,13 +344,11 @@ def compile( "num_frames": latent_frames, # Latent frames }, ], - 
"vae_decoder": [ - { - "num_frames": latent_frames, - "latent_height": latent_height, - "latent_width": latent_width, - } - ], + "vae_decoder": { + "num_frames": latent_frames, + "latent_height": latent_height, + "latent_width": latent_width, + }, } # Use generic utility functions for compilation diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index 7054d573b..188f7f70b 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -35,12 +35,12 @@ }, "vae_decoder": { - "specializations": [ + "specializations": { "batch_size": 1, "num_channels": 16 } - ], + , "compilation": { "onnx_path": null, From 5007858ac004cd9bd7fb90b2d88f52f59aa1e0f6 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Tue, 6 Jan 2026 09:18:34 +0000 Subject: [PATCH 07/10] Minor Changes Signed-off-by: Mohit Soni --- QEfficient/diffusers/pipelines/pipeline_module.py | 8 ++++---- QEfficient/diffusers/pipelines/wan/pipeline_wan.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index e8b72c352..d960eceae 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -302,13 +302,13 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: - output_names (List[str]): Names of model outputs """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - num_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES + latent_frames = constants.WAN_ONNX_EXPORT_LATENT_FRAMES latent_height = constants.WAN_ONNX_EXPORT_LATENT_HEIGHT_180P latent_width = constants.WAN_ONNX_EXPORT_LATENT_WIDTH_180P # VAE decoder takes latent representation as input example_inputs = { - "latent_sample": torch.randn(bs, 16, num_frames, latent_height, latent_width), + "latent_sample": torch.randn(bs, 16, latent_frames, latent_height, latent_width), "return_dict": False, } @@ -316,7 +316,7 @@ def get_video_onnx_params(self) -> Tuple[Dict, Dict, List[str]]: # All dimensions except channels can be dynamic dynamic_axes = { - "latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"}, + "latent_sample": {0: "batch_size", 2: "latent_frames", 3: "latent_height", 4: "latent_width"}, } return example_inputs, dynamic_axes, output_names @@ -611,7 +611,7 @@ def get_onnx_params(self): "hidden_states": { 0: "batch_size", 1: "num_channels", - 2: "num_frames", + 2: "latent_frames", 3: "latent_height", 4: "latent_width", }, diff --git a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py index a5a27b141..cd1b59cd8 100644 --- a/QEfficient/diffusers/pipelines/wan/pipeline_wan.py +++ b/QEfficient/diffusers/pipelines/wan/pipeline_wan.py @@ -334,24 +334,25 @@ def compile( "cl": cl, # Compressed latent dimension "latent_height": latent_height, # Latent space height "latent_width": latent_width, # Latent space width - "num_frames": latent_frames, # Latent frames + "latent_frames": latent_frames, # Latent frames }, # low noise { "cl": cl, # Compressed latent dimension "latent_height": latent_height, # Latent space height "latent_width": latent_width, # Latent space width - "num_frames": latent_frames, # Latent frames + "latent_frames": latent_frames, # Latent frames }, ], "vae_decoder": { - "num_frames": latent_frames, + "latent_frames": latent_frames, "latent_height": latent_height, "latent_width": latent_width, }, } # Use generic utility 
functions for compilation + logger.warning('For VAE compilation use QAIC_COMPILER_OPTS_UNSUPPORTED="-aic-hmx-conv3d" ') if parallel: compile_modules_parallel(self.modules, self.custom_config, specialization_updates) else: From 6e851fbe3232f9ec0ec36bdcd446e0d0d70b5c53 Mon Sep 17 00:00:00 2001 From: vtirumal Date: Tue, 6 Jan 2026 10:57:09 +0000 Subject: [PATCH 08/10] To skip pytest for Wan Signed-off-by: vtirumal --- scripts/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 3420c025b..d51765a4d 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -95,7 +95,7 @@ pipeline { export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic_diffusion && export HF_HUB_CACHE=/huggingface_hub && - pytest tests -m '(not cli) and (on_qaic) and (diffusion_models) and (not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log_diffusion.xml && + pytest tests -m '(not cli) and (on_qaic) and (diffusion_models) and (not wan) and (not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log_diffusion.xml && junitparser merge tests/tests_log_diffusion.xml tests/tests_log.xml && deactivate" ''' From dc42816250c3c852c42e7930ccd9dff6b9a55225 Mon Sep 17 00:00:00 2001 From: vtirumal Date: Thu, 8 Jan 2026 04:56:07 +0000 Subject: [PATCH 09/10] Adding init for encoder and compiler_only flag for wan configs Signed-off-by: vtirumal --- QEfficient/diffusers/models/autoencoders/__init__.py | 6 ++++++ QEfficient/diffusers/pipelines/configs/wan_config.json | 1 + examples/diffusers/wan/wan_config.json | 1 + tests/diffusers/wan_test_config.json | 1 + 4 files changed, 9 insertions(+) create mode 100644 QEfficient/diffusers/models/autoencoders/__init__.py diff --git a/QEfficient/diffusers/models/autoencoders/__init__.py b/QEfficient/diffusers/models/autoencoders/__init__.py new file mode 100644 index 000000000..75daf1953 --- /dev/null +++ b/QEfficient/diffusers/models/autoencoders/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- diff --git a/QEfficient/diffusers/pipelines/configs/wan_config.json b/QEfficient/diffusers/pipelines/configs/wan_config.json index eba96f259..fb6f3dccd 100644 --- a/QEfficient/diffusers/pipelines/configs/wan_config.json +++ b/QEfficient/diffusers/pipelines/configs/wan_config.json @@ -24,6 +24,7 @@ "mdp_ts_num_devices": 16, "mxfp6_matmul": true, "convert_to_fp16": true, + "compile_only":true, "aic_num_cores": 16, "mos": 1, "mdts_mos": 1 diff --git a/examples/diffusers/wan/wan_config.json b/examples/diffusers/wan/wan_config.json index 188f7f70b..efeb7c877 100644 --- a/examples/diffusers/wan/wan_config.json +++ b/examples/diffusers/wan/wan_config.json @@ -25,6 +25,7 @@ "mdp_ts_num_devices": 16, "mxfp6_matmul": true, "convert_to_fp16": true, + "compile_only":true, "aic_num_cores": 16, "mos": 1, "mdts_mos": 1 diff --git a/tests/diffusers/wan_test_config.json b/tests/diffusers/wan_test_config.json index 1ed36294a..25869bbe8 100644 --- a/tests/diffusers/wan_test_config.json +++ b/tests/diffusers/wan_test_config.json @@ -51,6 +51,7 @@ "mdp_ts_num_devices": 1, "mxfp6_matmul": true, "convert_to_fp16": true, + "compile_only":true, "aic_num_cores": 16, "mos": 1, "mdts_mos": 1 From 92307b61603df55e078ce3867ab67c5a960e49b0 Mon Sep 17 00:00:00 2001 From: Mohit Soni Date: Thu, 8 Jan 2026 09:42:19 +0000 Subject: [PATCH 10/10] VAE export fix Signed-off-by: Mohit Soni --- QEfficient/diffusers/pipelines/pipeline_module.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py index d960eceae..4cc70d056 100644 --- a/QEfficient/diffusers/pipelines/pipeline_module.py +++ b/QEfficient/diffusers/pipelines/pipeline_module.py @@ -342,7 +342,9 @@ def export( Returns: str: Path to the exported ONNX model """ - self.model.config["_use_default_values"].sort() + + if hasattr(self.model.config, "_use_default_values"): + self.model.config["_use_default_values"].sort() return self._export( example_inputs=inputs,
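A note on the frame-doubling reshape in QEffWanResample.forward (patch 01): for "upsample3d", the reshape to (b, 2, c, t, h, w) implies the time conv has produced 2*c channels, i.e. two temporal outputs per input frame, and the stack/reshape pair interleaves those two groups along the time axis so the frame count doubles. The following minimal sketch (a made-up tensor, plain PyTorch, not QEfficient code) checks that interleaving:

    import torch

    b, c, t, h, w = 1, 3, 4, 2, 2
    # Stand-in for the time-conv output: 2*c channels, two temporal "copies" per input frame.
    x = torch.arange(b * 2 * c * t * h * w, dtype=torch.float32).reshape(b, 2 * c, t, h, w)

    y = x.reshape(b, 2, c, t, h, w)
    y = torch.stack((y[:, 0], y[:, 1]), dim=3)  # (b, c, t, 2, h, w)
    y = y.reshape(b, c, t * 2, h, w)            # (b, c, 2*t, h, w)

    # Even output frames come from the first channel group, odd frames from the second,
    # so each input frame expands into two consecutive output frames.
    assert torch.equal(y[:, :, 0::2], x[:, :c])
    assert torch.equal(y[:, :, 1::2], x[:, c:])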
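On the decode path added to pipeline_wan.py: before the compiled decoder runs, the latents are denormalized with the per-channel mean/std stored in the VAE config, and the result is passed as a numpy array under the ONNX input name "latent_sample". A small sketch of that arithmetic with placeholder values (z_dim, shapes and statistics here are illustrative, not the real Wan 2.2 config):

    import torch

    z_dim = 16                                     # illustrative; the pipeline reads vae.config.z_dim
    latents = torch.randn(1, z_dim, 21, 60, 104)   # (batch, z_dim, latent_frames, latent_height, latent_width)

    # Per-channel statistics; the pipeline takes these from vae.config.latents_mean / latents_std.
    latents_mean = torch.randn(z_dim).view(1, z_dim, 1, 1, 1)
    latents_std = 1.0 / (torch.rand(z_dim) + 0.5).view(1, z_dim, 1, 1, 1)

    # Undo the normalization applied when the latents were produced.
    latents = latents / latents_std + latents_mean

    # The compiled decoder is fed numpy inputs keyed by the ONNX input name:
    inputs = {"latent_sample": latents.numpy()}
    # video = vae_decoder.qpc_session.run(inputs)             # QAICInferenceSession on QAIC
    # frames = torch.from_numpy(video["sample"])              # decoded (batch, 3, num_frames, H, W)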
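The export-time dynamic axes and the compile-time specialization must agree on axis names for the decoder; after patch 07 both sides use latent_frames / latent_height / latent_width. A condensed view of that pairing (the concrete values mirror the 21 x 60 x 104 latent shape from the example config and are illustrative only):

    # ONNX export: input dimensions that stay symbolic
    dynamic_axes = {
        "latent_sample": {0: "batch_size", 2: "latent_frames", 3: "latent_height", 4: "latent_width"},
    }

    # QPC compile: concrete values bound to those symbols for one specialization
    specialization = {
        "batch_size": 1,
        "num_channels": 16,
        "latent_frames": 21,
        "latent_height": 60,
        "latent_width": 104,
    }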