diff --git a/docker/common/uv-pytorch.lock b/docker/common/uv-pytorch.lock
index c26fd7e54..8bca7cbeb 100644
--- a/docker/common/uv-pytorch.lock
+++ b/docker/common/uv-pytorch.lock
@@ -68,7 +68,7 @@ overrides = [
     { name = "nvidia-nccl-cu12", marker = "sys_platform == 'never'" },
     { name = "torch", marker = "sys_platform == 'never'", index = "https://download.pytorch.org/whl/cpu" },
     { name = "torchao", marker = "sys_platform == 'never'" },
-    { name = "torchvision", marker = "sys_platform == 'never'" },
+    { name = "torchvision", marker = "sys_platform == 'never'", index = "https://download.pytorch.org/whl/cpu" },
     { name = "transformer-engine", marker = "sys_platform == 'never'" },
     { name = "transformer-engine-torch", marker = "sys_platform == 'never'" },
     { name = "triton", marker = "sys_platform == 'never'" },
@@ -2279,6 +2279,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
 ]
 
+[[package]]
+name = "kernels"
+version = "0.12.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "packaging" },
+    { name = "pyyaml" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a9/07/d2b635e965b232cae1aa873c6e0458947196be8dca7bb02e64d3cd6e8d19/kernels-0.12.2.tar.gz", hash = "sha256:812fc43c2814f046cee655cbebf3918cddd489715773670bdb38cca3f5203b5b", size = 57108, upload-time = "2026-03-04T10:03:00.379Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/08/be/f5d6758b48633e4f6a28198fcf4bf9f763cc6a82e2335d9fe8802a5cb440/kernels-0.12.2-py3-none-any.whl", hash = "sha256:1289261804748cf3cf8e3afab80b505b0f1b28e4ec88379cdf08dc31e64964b8", size = 55205, upload-time = "2026-03-04T10:02:59.305Z" },
+]
+
 [[package]]
 name = "kiwisolver"
 version = "1.4.9"
@@ -3235,6 +3250,7 @@ all = [
     { name = "ftfy" },
     { name = "imageio" },
     { name = "imageio-ffmpeg" },
+    { name = "kernels" },
     { name = "mamba-ssm" },
     { name = "mistral-common", extra = ["opencv"] },
     { name = "numba", version = "0.53.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
@@ -3252,6 +3268,7 @@ all = [
     { name = "sentencepiece" },
     { name = "timm" },
     { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
+    { name = "torchvision", marker = "sys_platform == 'never'" },
     { name = "transformer-engine", marker = "sys_platform == 'never'" },
 ]
 cuda = [
@@ -3271,7 +3288,9 @@ diffusion = [
     { name = "ftfy" },
     { name = "imageio" },
     { name = "imageio-ffmpeg" },
+    { name = "kernels" },
     { name = "opencv-python-headless" },
+    { name = "torchvision", marker = "sys_platform == 'never'" },
 ]
 extra = [
     { name = "flash-linear-attention" },
@@ -3358,6 +3377,7 @@ requires-dist = [
     { name = "ftfy", marker = "extra == 'diffusion'" },
     { name = "imageio", marker = "extra == 'diffusion'" },
     { name = "imageio-ffmpeg", marker = "extra == 'diffusion'" },
+    { name = "kernels", marker = "extra == 'diffusion'" },
     { name = "mamba-ssm", marker = "extra == 'cuda'" },
     { name = "megatron-fsdp", specifier = ">=0.2.3" },
     { name = "mistral-common", extras = ["audio", "hf-hub", "image", "sentencepiece"] },
@@ -3390,6 +3410,9 @@ requires-dist = [
     { name = "torchao" },
     { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
     { name = "torchdata" },
+    { name = "torchvision", marker = "sys_platform == 'darwin' and extra == 'diffusion'", index = "https://pypi.org/simple" },
+    { name = "torchvision", marker = "sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cpu" },
+    { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cu129" },
     { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'cuda'", specifier = "<=2.11.0" },
     { name = "transformers", specifier = ">=5.0.0" },
     { name = "wandb" },
@@ -6409,8 +6432,8 @@ wheels = [
 
 [[package]]
 name = "torchvision"
-version = "0.23.0"
-source = { registry = "https://pypi.org/simple" }
+version = "0.25.0+cpu"
+source = { registry = "https://download.pytorch.org/whl/cpu" }
 dependencies = [
     { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" },
     { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" },
diff --git a/examples/diffusion/finetune/flux_t2i_flow.yaml b/examples/diffusion/finetune/flux_t2i_flow.yaml
index 52674fe61..46175d806 100644
--- a/examples/diffusion/finetune/flux_t2i_flow.yaml
+++ b/examples/diffusion/finetune/flux_t2i_flow.yaml
@@ -54,7 +54,7 @@ step_scheduler:
 
 data:
   dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_flux_multiresolution_dataloader
+    _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader
     cache_dir: PATH_TO_YOUR_DATA
     train_text_encoder: false
     num_workers: 10
diff --git a/examples/diffusion/finetune/wan2_1_t2v_flow.yaml b/examples/diffusion/finetune/wan2_1_t2v_flow.yaml
index 0c47c543a..88a558ed0 100644
--- a/examples/diffusion/finetune/wan2_1_t2v_flow.yaml
+++ b/examples/diffusion/finetune/wan2_1_t2v_flow.yaml
@@ -22,10 +22,14 @@ step_scheduler:
 
 data:
   dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_dataloader
-    meta_folder: PATH_TO_YOUR_DATA
+    _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader
+    cache_dir: PATH_TO_YOUR_DATA
+    model_type: wan
+    base_resolution: [512, 512]
+    dynamic_batch_size: false
+    shuffle: true
+    drop_last: false
     num_workers: 2
-    device: cpu
 
 optim:
   learning_rate: 5e-6
diff --git a/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml b/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml
index 72b26cf03..715c3b711 100644
--- a/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml
+++ b/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml
@@ -22,10 +22,14 @@ step_scheduler:
 
 data:
   dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_dataloader
-    meta_folder: PATH_TO_YOUR_DATA
+    _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader
+    cache_dir: PATH_TO_YOUR_DATA
+    model_type: wan
+    base_resolution: [512, 512]
+    dynamic_batch_size: false
+    shuffle: true
+    drop_last: false
     num_workers: 2
-    device: cpu
 
 
 optim:
diff --git a/examples/diffusion/generate/flux_generate.py b/examples/diffusion/generate/flux_generate.py
index 623b1bb4b..195eed9fd 100644
--- a/examples/diffusion/generate/flux_generate.py
+++ b/examples/diffusion/generate/flux_generate.py
@@ -30,7 +30,7 @@
 from diffusers import FluxPipeline
 
 # Import the provided dataloader builder
-from nemo_automodel.components.datasets.diffusion import build_flux_multiresolution_dataloader
+from nemo_automodel.components.datasets.diffusion import build_text_to_image_multiresolution_dataloader
 
 
 def parse_args():
@@ -187,7 +187,7 @@ def main():
     print("=" * 80)
     print(f"Initializing Multiresolution Dataloader: {args.data_path}")
 
-    dataloader, _ = build_flux_multiresolution_dataloader(
+    dataloader, _ = build_text_to_image_multiresolution_dataloader(
         cache_dir=args.data_path, batch_size=1, num_workers=args.num_workers, dynamic_batch_size=True, shuffle=False
     )
     print(f"[INFO] Dataloader ready. Batches: {len(dataloader)}")
diff --git a/examples/diffusion/pretrain/flux_t2i_flow.yaml b/examples/diffusion/pretrain/flux_t2i_flow.yaml
index 418b543bc..d85805eb7 100644
--- a/examples/diffusion/pretrain/flux_t2i_flow.yaml
+++ b/examples/diffusion/pretrain/flux_t2i_flow.yaml
@@ -58,7 +58,7 @@ step_scheduler:
 
 data:
   dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_flux_multiresolution_dataloader
+    _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader
     cache_dir: PATH_TO_YOUR_DATA
     train_text_encoder: false
     num_workers: 1
diff --git a/nemo_automodel/components/datasets/diffusion/__init__.py b/nemo_automodel/components/datasets/diffusion/__init__.py
index ee3451498..2e829606c 100644
--- a/nemo_automodel/components/datasets/diffusion/__init__.py
+++ b/nemo_automodel/components/datasets/diffusion/__init__.py
@@ -17,14 +17,26 @@
 import importlib
 
 _LAZY_ATTRS = {
-    "MetaFilesDataset": (".meta_files_dataset", "MetaFilesDataset"),
+    # Dataset classes
+    "BaseMultiresolutionDataset": (".base_dataset", "BaseMultiresolutionDataset"),
     "TextToImageDataset": (".text_to_image_dataset", "TextToImageDataset"),
+    "TextToVideoDataset": (".text_to_video_dataset", "TextToVideoDataset"),
+    "MetaFilesDataset": (".meta_files_dataset", "MetaFilesDataset"),
+    # Utilities
     "MultiTierBucketCalculator": (".multi_tier_bucketing", "MultiTierBucketCalculator"),
     "SequentialBucketSampler": (".sampler", "SequentialBucketSampler"),
-    "collate_fn_flux": (".collate_fns", "collate_fn_flux"),
-    "build_flux_multiresolution_dataloader": (".collate_fns", "build_flux_multiresolution_dataloader"),
-    "build_mock_dataloader": (".mock_dataloader", "build_mock_dataloader"),
+    "VIDEO_OPTIONAL_FIELDS": (".text_to_video_dataset", "VIDEO_OPTIONAL_FIELDS"),
+    # Collate functions
+    "collate_fn_text_to_image": (".collate_fns", "collate_fn_text_to_image"),
+    "collate_fn_video": (".collate_fns", "collate_fn_video"),
+    "collate_fn_production": (".collate_fns", "collate_fn_production"),
+    # Dataloader builders
+    "build_text_to_image_multiresolution_dataloader": (".collate_fns", "build_text_to_image_multiresolution_dataloader"),
+    "build_video_multiresolution_dataloader": (".collate_fns", "build_video_multiresolution_dataloader"),
+    # Legacy (non-multiresolution)
     "build_dataloader": (".meta_files_dataset", "build_dataloader"),
+    # Mock/test
+    "build_mock_dataloader": (".mock_dataloader", "build_mock_dataloader"),
 }
 
 __all__ = sorted(_LAZY_ATTRS.keys())
diff --git a/nemo_automodel/components/datasets/diffusion/base_dataset.py b/nemo_automodel/components/datasets/diffusion/base_dataset.py
new file mode 100644
index 000000000..f8e74eb85
--- /dev/null
+++ b/nemo_automodel/components/datasets/diffusion/base_dataset.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Dict, List
+
+from torch.utils.data import Dataset
+
+from .multi_tier_bucketing import MultiTierBucketCalculator
+
+logger = logging.getLogger(__name__)
+
+
+class BaseMultiresolutionDataset(Dataset, ABC):
+    """Abstract base class for multiresolution datasets with bucket-based sampling.
+
+    Loads sample metadata from ``cache_dir`` and groups sample indices into
+    (aspect name, resolution) buckets; subclasses implement ``__getitem__``.
+    """
+
+    def __init__(self, cache_dir: str, quantization: int = 64):
+        """
+        Args:
+            cache_dir: Directory containing preprocessed cache (metadata.json + shards)
+            quantization: Resolution quantization factor (64 for images, 8 for video)
+        """
+        self.cache_dir = Path(cache_dir)
+        self.metadata = self._load_metadata()
+        logger.info(f"Loaded dataset with {len(self.metadata)} samples")
+
+        # Bucket grouping reads self.metadata, so it must run after the load above.
+        self._group_by_bucket()
+
+        # Initialize bucket calculator for dynamic batch sizes
+        self.calculator = MultiTierBucketCalculator(quantization=quantization)
+
+    def _load_metadata(self) -> List[Dict]:
+        """Load metadata from cache directory.
+
+        Expects metadata.json with "shards" key referencing shard files.
+
+        Raises:
+            FileNotFoundError: If metadata.json is missing from the cache directory.
+            ValueError: If metadata.json is not a dict with a "shards" key.
+        """
+        metadata_file = self.cache_dir / "metadata.json"
+        if not metadata_file.exists():
+            raise FileNotFoundError(f"No metadata.json found in {self.cache_dir}")
+
+        # JSON is UTF-8; pin the encoding so parsing does not depend on the platform locale.
+        with open(metadata_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        if not isinstance(data, dict) or "shards" not in data:
+            raise ValueError(f"Invalid metadata format in {metadata_file}. Expected dict with 'shards' key.")
+
+        # Shard names are resolved relative to cache_dir; entries keep shard order.
+        metadata = []
+        for shard_name in data["shards"]:
+            with open(self.cache_dir / shard_name, "r", encoding="utf-8") as f:
+                metadata.extend(json.load(f))
+        return metadata
+
+    def _aspect_ratio_to_name(self, aspect_ratio: float) -> str:
+        """Convert aspect ratio to a descriptive name: "tall" (< 0.85), "wide" (> 1.18), else "square"."""
+        if aspect_ratio < 0.85:
+            return "tall"
+        if aspect_ratio > 1.18:
+            return "wide"
+        return "square"
+
+    def _group_by_bucket(self):
+        """Group samples by bucket (aspect_ratio + resolution).
+
+        Populates bucket_groups and sorted_bucket_keys; empty metadata yields zero buckets.
+        """
+        self.bucket_groups = {}
+
+        # Support both bucket_resolution (video) and crop_resolution (image) keys.
+        # The key is chosen from the first sample; guard the lookup so empty metadata cannot raise.
+        first_item = self.metadata[0] if self.metadata else {}
+        resolution_key = "bucket_resolution" if "bucket_resolution" in first_item else "crop_resolution"
+
+        for idx, item in enumerate(self.metadata):
+            aspect_ratio = item.get("aspect_ratio", 1.0)
+            aspect_name = self._aspect_ratio_to_name(aspect_ratio)
+            resolution = tuple(item[resolution_key])
+            bucket_key = (aspect_name, resolution)
+            if bucket_key not in self.bucket_groups:
+                self.bucket_groups[bucket_key] = {
+                    "indices": [],
+                    "aspect_name": aspect_name,
+                    "aspect_ratio": aspect_ratio,
+                    "resolution": resolution,
+                    "pixels": resolution[0] * resolution[1],
+                }
+            self.bucket_groups[bucket_key]["indices"].append(idx)
+
+        # Sort buckets by resolution (low to high for optimal memory usage)
+        self.sorted_bucket_keys = sorted(self.bucket_groups.keys(), key=lambda k: self.bucket_groups[k]["pixels"])
+
+        logger.info(f"\nDataset organized into {len(self.bucket_groups)} buckets:")
+        for aspect_name, resolution in self.sorted_bucket_keys:
+            count = len(self.bucket_groups[(aspect_name, resolution)]["indices"])
+            logger.info(f"  {aspect_name:6s} {resolution[0]:4d}x{resolution[1]:4d}: {count:5d} samples")
+
+    def get_bucket_info(self) -> Dict:
+        """Return the bucket count plus a sample count per bucket, keyed "<aspect>/<res0>x<res1>"."""
+        return {
+            "total_buckets": len(self.bucket_groups),
+            "buckets": {f"{k[0]}/{k[1][0]}x{k[1][1]}": len(v["indices"]) for k, v in self.bucket_groups.items()},
+        }
+
+    def __len__(self) -> int:
+        return len(self.metadata)
+
+    @abstractmethod
+    def __getitem__(self, idx: int) -> Dict:
+        """Load a single sample. Subclasses must implement."""
+        ...
diff --git a/nemo_automodel/components/datasets/diffusion/collate_fns.py b/nemo_automodel/components/datasets/diffusion/collate_fns.py
index 3f9567661..426f7bd56 100644
--- a/nemo_automodel/components/datasets/diffusion/collate_fns.py
+++ b/nemo_automodel/components/datasets/diffusion/collate_fns.py
@@ -13,27 +13,70 @@
 # limitations under the License.
 
 """
-Flux-compatible collate function that wraps the multiresolution dataloader output
-to match the FlowMatchingPipeline expected batch format.
+Collate functions and dataloader builders for multiresolution diffusion training.
+
+Supports both image and video pipelines via the FlowMatchingPipeline
+expected batch format.
 """
 
+import functools
 import logging
-from typing import Dict, List, Tuple
+from typing import Callable, Dict, List, Tuple
 
+import torch
 from torch.utils.data import DataLoader
 
-from .sampler import (
-    SequentialBucketSampler,
-    collate_fn_production,
-)
+from .sampler import SequentialBucketSampler
 from .text_to_image_dataset import TextToImageDataset
+from .text_to_video_dataset import TextToVideoDataset, collate_optional_video_fields
 
 logger = logging.getLogger(__name__)
 
 
-def collate_fn_flux(batch: List[Dict]) -> Dict:
+def collate_fn_production(batch: List[Dict]) -> Dict:
+    """Stack per-sample tensors and gather metadata lists for one same-resolution batch.
+
+    Args:
+        batch: Samples providing "latent", "crop_resolution", "original_resolution",
+            "crop_offset", "prompt", "image_path", "bucket_id" and "aspect_ratio", plus either
+            pre-encoded text ("clip_hidden", "pooled_prompt_embeds", "prompt_embeds") or
+            tokens ("clip_tokens", "t5_tokens").
+
+    Returns:
+        Dict with tensor fields stacked via torch.stack and metadata fields kept as lists.
+
+    Raises:
+        AssertionError: If the batch does not contain exactly one distinct crop resolution.
+    """
+    # Raise explicitly instead of using `assert` so the check is not stripped by `python -O`.
+    resolutions = {tuple(item["crop_resolution"].tolist()) for item in batch}
+    if len(resolutions) != 1:
+        raise AssertionError(f"Mixed resolutions in batch: {resolutions}")
+
+    # Tensor fields are stacked along a new leading dim; the rest stay Python lists.
+    output = {
+        "latent": torch.stack([item["latent"] for item in batch]),
+        "crop_resolution": torch.stack([item["crop_resolution"] for item in batch]),
+        "original_resolution": torch.stack([item["original_resolution"] for item in batch]),
+        "crop_offset": torch.stack([item["crop_offset"] for item in batch]),
+        "prompt": [item["prompt"] for item in batch],
+        "image_path": [item["image_path"] for item in batch],
+        "bucket_id": [item["bucket_id"] for item in batch],
+        "aspect_ratio": [item["aspect_ratio"] for item in batch],
+    }
+
+    # Pre-encoded vs tokenized text is decided from the first sample only.
+    encoded = "clip_hidden" in batch[0]
+    text_keys = ("clip_hidden", "pooled_prompt_embeds", "prompt_embeds") if encoded else ("clip_tokens", "t5_tokens")
+    for key in text_keys:
+        output[key] = torch.stack([item[key] for item in batch])
+
+    return output
+
+
+def collate_fn_text_to_image(batch: List[Dict]) -> Dict:
     """
-    Flux-compatible collate function that transforms multiresolution batch output
+    Text-to-image collate function that transforms multiresolution batch output
     to match FlowMatchingPipeline expected format.
 
     Args:
@@ -45,11 +88,11 @@ def collate_fn_flux(batch: List[Dict]) -> Dict:
     # First, use the production collate to stack tensors
     production_batch = collate_fn_production(batch)
 
-    # Keep latent as 4D [B, C, H, W] for Flux (image model, not video)
+    # Keep latent as 4D [B, C, H, W] for image (not video)
     latent = production_batch["latent"]
 
-    # Use "image_latents" key for 4D tensors (FluxAdapter expects 4D)
-    flux_batch = {
+    # Use "image_latents" key for 4D tensors
+    image_batch = {
         "image_latents": latent,
         "data_type": "image",
         "metadata": {
@@ -66,23 +109,64 @@ def collate_fn_flux(batch: List[Dict]) -> Dict:
     # Handle text embeddings (pre-encoded vs tokenized)
     if "prompt_embeds" in production_batch:
         # Pre-encoded text embeddings
-        flux_batch["text_embeddings"] = production_batch["prompt_embeds"]
-        flux_batch["pooled_prompt_embeds"] = production_batch["pooled_prompt_embeds"]
+        image_batch["text_embeddings"] = production_batch["prompt_embeds"]
+        image_batch["pooled_prompt_embeds"] = production_batch["pooled_prompt_embeds"]
         # Also include CLIP hidden for models that need it
         if "clip_hidden" in production_batch:
-            flux_batch["clip_hidden"] = production_batch["clip_hidden"]
+            image_batch["clip_hidden"] = production_batch["clip_hidden"]
     else:
         # Tokenized - need to encode during training (not supported yet)
-        flux_batch["t5_tokens"] = production_batch["t5_tokens"]
-        flux_batch["clip_tokens"] = production_batch["clip_tokens"]
+        image_batch["t5_tokens"] = production_batch["t5_tokens"]
+        image_batch["clip_tokens"] = production_batch["clip_tokens"]
         raise NotImplementedError(
             "On-the-fly text encoding not yet supported. Please use pre-encoded text embeddings in your dataset."
         )
 
-    return flux_batch
+    return image_batch
 
 
-def build_flux_multiresolution_dataloader(
+def _build_multiresolution_dataloader_core(
+    *,
+    dataset,
+    collate_fn: Callable,
+    batch_size: int,
+    dp_rank: int,
+    dp_world_size: int,
+    base_resolution: Tuple[int, int] = (512, 512),
+    drop_last: bool = True,
+    shuffle: bool = True,
+    dynamic_batch_size: bool = False,
+    num_workers: int = 4,
+    pin_memory: bool = True,
+    prefetch_factor: int = 2,
+) -> Tuple[DataLoader, SequentialBucketSampler]:
+    """Wire a bucket sampler and a DataLoader around an already-built dataset.
+
+    ``shuffle`` drives both bucket-level and within-bucket shuffling; the sampler is
+    handed to the DataLoader as its ``batch_sampler``.
+    """
+    bucket_sampler = SequentialBucketSampler(
+        dataset,
+        base_batch_size=batch_size,
+        base_resolution=base_resolution,
+        drop_last=drop_last,
+        shuffle_buckets=shuffle,
+        shuffle_within_bucket=shuffle,
+        dynamic_batch_size=dynamic_batch_size,
+        num_replicas=dp_world_size,
+        rank=dp_rank,
+    )
+    has_workers = num_workers > 0
+    loader_kwargs = dict(batch_sampler=bucket_sampler, collate_fn=collate_fn, num_workers=num_workers)
+    loader_kwargs.update(pin_memory=pin_memory, persistent_workers=has_workers)
+    # prefetch_factor is only forwarded when worker processes are in use.
+    loader_kwargs["prefetch_factor"] = prefetch_factor if has_workers else None
+    loader = DataLoader(dataset, **loader_kwargs)
+
+    return loader, bucket_sampler
+
+
+def build_text_to_image_multiresolution_dataloader(
     *,
     # TextToImageDataset parameters
     cache_dir: str,
@@ -100,10 +184,10 @@ def build_flux_multiresolution_dataloader(
     prefetch_factor: int = 2,
 ) -> Tuple[DataLoader, SequentialBucketSampler]:
     """
-    Build a Flux-compatible multiresolution dataloader for TrainDiffusionRecipe.
+    Build a text-to-image multiresolution dataloader for TrainDiffusionRecipe.
 
     This wraps the existing TextToImageDataset and SequentialBucketSampler
-    with a Flux-compatible collate function.
+    with a text-to-image collate function.
 
     Args:
         cache_dir: Directory containing preprocessed cache (metadata.json, shards, and resolution subdirs)
@@ -122,40 +206,138 @@ def build_flux_multiresolution_dataloader(
     Returns:
         Tuple of (DataLoader, SequentialBucketSampler)
     """
-    logger.info("Building Flux multiresolution dataloader:")
+    logger.info("Building text-to-image multiresolution dataloader:")
     logger.info(f"  cache_dir: {cache_dir}")
     logger.info(f"  train_text_encoder: {train_text_encoder}")
     logger.info(f"  batch_size: {batch_size}")
     logger.info(f"  dp_rank: {dp_rank}, dp_world_size: {dp_world_size}")
 
-    # Create dataset
     dataset = TextToImageDataset(
         cache_dir=cache_dir,
         train_text_encoder=train_text_encoder,
     )
 
-    # Create sampler
-    sampler = SequentialBucketSampler(
-        dataset,
-        base_batch_size=batch_size,
+    dataloader, sampler = _build_multiresolution_dataloader_core(
+        dataset=dataset,
+        collate_fn=collate_fn_text_to_image,
+        batch_size=batch_size,
+        dp_rank=dp_rank,
+        dp_world_size=dp_world_size,
         base_resolution=base_resolution,
         drop_last=drop_last,
-        shuffle_buckets=shuffle,
-        shuffle_within_bucket=shuffle,
+        shuffle=shuffle,
         dynamic_batch_size=dynamic_batch_size,
-        num_replicas=dp_world_size,
-        rank=dp_rank,
+        num_workers=num_workers,
+        pin_memory=pin_memory,
+        prefetch_factor=prefetch_factor,
     )
 
-    # Create dataloader with Flux-compatible collate
-    dataloader = DataLoader(
-        dataset,
-        batch_sampler=sampler,
-        collate_fn=collate_fn_flux,  # Use Flux-compatible collate
+    logger.info(f"  Dataset size: {len(dataset)}")
+    logger.info(f"  Batches per epoch: {len(sampler)}")
+
+    return dataloader, sampler
+
+
+def collate_fn_video(batch: List[Dict], model_type: str = "wan") -> Dict:
+    """
+    Video collate function for multiresolution video training.
+
+    Concatenates "video_latents" and "text_embeddings" along dim 0, tags the batch as
+    "video", then passes the batch and result to collate_optional_video_fields.
+
+    Args:
+        batch: List of samples from TextToVideoDataset
+        model_type: Accepted for builder compatibility; not read by this function
+
+    Returns:
+        Dict compatible with FlowMatchingPipeline.step()
+
+    Raises:
+        AssertionError: If the batch does not contain exactly one distinct bucket resolution
+    """
+    # Raise explicitly instead of using `assert` so the check is not stripped by `python -O`.
+    resolutions = {tuple(item["bucket_resolution"].tolist()) for item in batch}
+    if len(resolutions) != 1:
+        raise AssertionError(f"Mixed resolutions in batch: {resolutions}")
+
+    result = {
+        "video_latents": torch.cat([item["video_latents"] for item in batch], dim=0),
+        "text_embeddings": torch.cat([item["text_embeddings"] for item in batch], dim=0),
+        "data_type": "video",
+    }
+
+    # Collate model-specific optional fields
+    collate_optional_video_fields(batch, result)
+    return result
+
+
+def build_video_multiresolution_dataloader(
+    *,
+    cache_dir: str,
+    model_type: str = "wan",
+    device: str = "cpu",
+    batch_size: int = 1,
+    dp_rank: int = 0,
+    dp_world_size: int = 1,
+    base_resolution: Tuple[int, int] = (512, 512),
+    drop_last: bool = True,
+    shuffle: bool = True,
+    dynamic_batch_size: bool = False,
+    num_workers: int = 2,
+    pin_memory: bool = True,
+    prefetch_factor: int = 2,
+) -> Tuple[DataLoader, SequentialBucketSampler]:
+    """
+    Build a multiresolution video dataloader for TrainDiffusionRecipe.
+
+    Uses TextToVideoDataset with SequentialBucketSampler for bucket-based
+    multiresolution video training (e.g. Wan, Hunyuan).
+
+    Args:
+        cache_dir: Directory containing preprocessed cache (metadata.json + shards + WxH/*.meta)
+        model_type: Model type ("wan", "hunyuan", etc.)
+        device: Device to load tensors to
+        batch_size: Batch size per GPU
+        dp_rank: Data parallel rank
+        dp_world_size: Data parallel world size
+        base_resolution: Base resolution for dynamic batch sizing
+        drop_last: Drop incomplete batches
+        shuffle: Shuffle data
+        dynamic_batch_size: Scale batch size by resolution
+        num_workers: DataLoader workers
+        pin_memory: Pin memory for GPU transfer
+        prefetch_factor: Prefetch batches per worker
+
+    Returns:
+        Tuple of (DataLoader, SequentialBucketSampler)
+    """
+    logger.info("Building video multiresolution dataloader:")
+    logger.info(f"  cache_dir: {cache_dir}")
+    logger.info(f"  model_type: {model_type}")
+    logger.info(f"  batch_size: {batch_size}")
+    logger.info(f"  dp_rank: {dp_rank}, dp_world_size: {dp_world_size}")
+
+    dataset = TextToVideoDataset(
+        cache_dir=cache_dir,
+        model_type=model_type,
+        device=device,
+    )
+
+    collate = functools.partial(collate_fn_video, model_type=model_type)
+
+    dataloader, sampler = _build_multiresolution_dataloader_core(
+        dataset=dataset,
+        collate_fn=collate,
+        batch_size=batch_size,
+        dp_rank=dp_rank,
+        dp_world_size=dp_world_size,
+        base_resolution=base_resolution,
+        drop_last=drop_last,
+        shuffle=shuffle,
+        dynamic_batch_size=dynamic_batch_size,
         num_workers=num_workers,
         pin_memory=pin_memory,
-        prefetch_factor=prefetch_factor if num_workers > 0 else None,
-        persistent_workers=num_workers > 0,
+        prefetch_factor=prefetch_factor,
     )
 
     logger.info(f"  Dataset size: {len(dataset)}")
diff --git a/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py b/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py
index 381b53745..4dbeea2bc 100644
--- a/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py
+++ b/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py
@@ -24,6 +24,8 @@
 import torch.distributed as dist
 from torch.utils.data import DataLoader, Dataset, DistributedSampler
 
+from .text_to_video_dataset import collate_optional_video_fields, load_optional_video_fields
+
 logger = logging.getLogger(__name__)
 
 
@@ -111,20 +113,6 @@ def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:  # type: ignore[ov
         text_embeddings: torch.Tensor = data["text_embeddings"].to(self.device)
         video_latents: torch.Tensor = data["video_latents"].to(self.device)
 
-        # Load text_mask if available (backwards compatible)
-        text_mask = data.get("text_mask")
-        text_embeddings_2 = data.get("text_embeddings_2")
-        text_mask_2 = data.get("text_mask_2")
-        image_embeds = data.get("image_embeds")
-        if text_mask is not None:
-            text_mask = text_mask.to(self.device)
-        if text_embeddings_2 is not None:
-            text_embeddings_2 = text_embeddings_2.to(self.device)
-        if text_mask_2 is not None:
-            text_mask_2 = text_mask_2.to(self.device)
-        if image_embeds is not None:
-            image_embeds = image_embeds.to(self.device)
-
         if self.transform_text is not None:
             text_embeddings = self.transform_text(text_embeddings)
         if self.transform_video is not None:
@@ -146,15 +134,8 @@ def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:  # type: ignore[ov
             "file_info": file_info,
         }
 
-        # Add text_mask if available (backwards compatible)
-        if text_mask is not None:
-            result["text_mask"] = text_mask
-        if text_embeddings_2 is not None:
-            result["text_embeddings_2"] = text_embeddings_2
-        if text_mask_2 is not None:
-            result["text_mask_2"] = text_mask_2
-        if image_embeds is not None:
-            result["image_embeds"] = image_embeds
+        # Optional model-specific fields (backwards compatible)
+        result.update(load_optional_video_fields(data, self.device))
 
         return result
 
@@ -174,19 +155,8 @@ def collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
         "file_info": [item["file_info"] for item in batch],
     }
 
-    # Collate text_mask if available (backwards compatible)
-    if len(batch) > 0 and "text_mask" in batch[0]:
-        text_mask = torch.cat([item["text_mask"] for item in batch], dim=0)
-        result["text_mask"] = text_mask
-    if len(batch) > 0 and "text_embeddings_2" in batch[0]:
-        text_embeddings_2 = torch.cat([item["text_embeddings_2"] for item in batch], dim=0)
-        result["text_embeddings_2"] = text_embeddings_2
-    if len(batch) > 0 and "text_mask_2" in batch[0]:
-        text_mask_2 = torch.cat([item["text_mask_2"] for item in batch], dim=0)
-        result["text_mask_2"] = text_mask_2
-    if len(batch) > 0 and "image_embeds" in batch[0]:
-        image_embeds = torch.cat([item["image_embeds"] for item in batch], dim=0)
-        result["image_embeds"] = image_embeds
+    # Optional model-specific fields (backwards compatible)
+    collate_optional_video_fields(batch, result)
 
     return result
 
diff --git a/nemo_automodel/components/datasets/diffusion/sampler.py b/nemo_automodel/components/datasets/diffusion/sampler.py
index 3ee806631..92ac23a9c 100644
--- a/nemo_automodel/components/datasets/diffusion/sampler.py
+++ b/nemo_automodel/components/datasets/diffusion/sampler.py
@@ -18,9 +18,9 @@
 
 import torch
 import torch.distributed as dist
-from torch.utils.data import DataLoader, Sampler
+from torch.utils.data import Sampler
 
-from .text_to_image_dataset import TextToImageDataset
+from .base_dataset import BaseMultiresolutionDataset
 
 logger = logging.getLogger(__name__)
 
@@ -41,7 +41,7 @@ class SequentialBucketSampler(Sampler[List[int]]):
 
     def __init__(
         self,
-        dataset: TextToImageDataset,
+        dataset: BaseMultiresolutionDataset,
         base_batch_size: int = 32,
         base_resolution: Tuple[int, int] = (512, 512),
         drop_last: bool = True,
@@ -54,7 +54,7 @@ def __init__(
     ):
         """
         Args:
-            dataset: TextToImageDataset
+            dataset: BaseMultiresolutionDataset (or any subclass)
             base_batch_size: Batch size (fixed if dynamic_batch_size=False,
                             or base for scaling if dynamic_batch_size=True)
             base_resolution: Reference resolution for batch size scaling
@@ -222,103 +222,3 @@ def get_batch_info(self, batch_idx: int) -> Dict:
             running_count += num_batches
 
         return {}
-
-
-def collate_fn_production(batch: List[Dict]) -> Dict:
-    """Production collate function with verification."""
-    # Verify all samples have same resolution
-    resolutions = [tuple(item["crop_resolution"].tolist()) for item in batch]
-    assert len(set(resolutions)) == 1, f"Mixed resolutions in batch: {set(resolutions)}"
-
-    # Stack tensors
-    latents = torch.stack([item["latent"] for item in batch])
-    crop_resolutions = torch.stack([item["crop_resolution"] for item in batch])
-    original_resolutions = torch.stack([item["original_resolution"] for item in batch])
-    crop_offsets = torch.stack([item["crop_offset"] for item in batch])
-
-    # Collect metadata
-    prompts = [item["prompt"] for item in batch]
-    image_paths = [item["image_path"] for item in batch]
-    bucket_ids = [item["bucket_id"] for item in batch]
-    aspect_ratios = [item["aspect_ratio"] for item in batch]
-
-    output = {
-        "latent": latents,
-        "crop_resolution": crop_resolutions,
-        "original_resolution": original_resolutions,
-        "crop_offset": crop_offsets,
-        "prompt": prompts,
-        "image_path": image_paths,
-        "bucket_id": bucket_ids,
-        "aspect_ratio": aspect_ratios,
-    }
-
-    # Handle text encodings
-    if "clip_hidden" in batch[0]:
-        output["clip_hidden"] = torch.stack([item["clip_hidden"] for item in batch])
-        output["pooled_prompt_embeds"] = torch.stack([item["pooled_prompt_embeds"] for item in batch])
-        output["prompt_embeds"] = torch.stack([item["prompt_embeds"] for item in batch])
-    else:
-        output["clip_tokens"] = torch.stack([item["clip_tokens"] for item in batch])
-        output["t5_tokens"] = torch.stack([item["t5_tokens"] for item in batch])
-
-    return output
-
-
-def build_multiresolution_dataloader(
-    *,
-    dataset: TextToImageDataset,
-    base_batch_size: int,
-    dp_rank: int,
-    dp_world_size: int,
-    base_resolution: Tuple[int, int] = (512, 512),
-    drop_last: bool = True,
-    shuffle: bool = True,
-    dynamic_batch_size: bool = False,
-    num_workers: int = 4,
-    pin_memory: bool = True,
-    prefetch_factor: int = 2,
-) -> Tuple[DataLoader, SequentialBucketSampler]:
-    """
-    Build production dataloader with sequential bucket iteration and distributed training support.
-
-    Args:
-        dataset: TextToImageDataset instance
-        base_batch_size: Batch size (fixed, or base for scaling if dynamic_batch_size=True)
-        dp_rank: Rank of current process in data parallel group
-        dp_world_size: Total number of processes in data parallel group
-        base_resolution: Reference resolution (only used if dynamic_batch_size=True)
-        drop_last: Drop incomplete batches
-        shuffle: Shuffle bucket order and samples within buckets each epoch
-        dynamic_batch_size: If True, scale batch size based on resolution.
-                           If False (default), use base_batch_size for all buckets.
-        num_workers: Number of data loading workers
-        pin_memory: Pin memory for faster GPU transfer
-        prefetch_factor: How many batches to prefetch per worker
-
-    Returns:
-        Tuple of (DataLoader, SequentialBucketSampler) for production training
-    """
-    sampler = SequentialBucketSampler(
-        dataset,
-        base_batch_size=base_batch_size,
-        base_resolution=base_resolution,
-        drop_last=drop_last,
-        shuffle_buckets=shuffle,
-        shuffle_within_bucket=shuffle,
-        dynamic_batch_size=dynamic_batch_size,
-        num_replicas=dp_world_size,
-        rank=dp_rank,
-    )
-
-    dataloader = DataLoader(
-        dataset,
-        batch_sampler=sampler,
-        collate_fn=collate_fn_production,
-        num_workers=num_workers,
-        pin_memory=pin_memory,
-        prefetch_factor=prefetch_factor if num_workers > 0 else None,
-        persistent_workers=num_workers > 0,  # Keep workers alive between epochs
-    )
-
-    return dataloader, sampler
diff --git a/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py b/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py
index 062e0aeec..41e356220 100644
--- a/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py
+++ b/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py
@@ -12,20 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import logging
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict
 
 import torch
-from torch.utils.data import Dataset
 
-from .multi_tier_bucketing import MultiTierBucketCalculator
+from .base_dataset import BaseMultiresolutionDataset
 
-logger = logging.getLogger(__name__)
 
-
-class TextToImageDataset(Dataset):
+class TextToImageDataset(BaseMultiresolutionDataset):
     """Text-to-Image dataset with hierarchical bucket organization."""
 
     def __init__(
@@ -38,97 +33,8 @@ def __init__(
             cache_dir: Directory containing preprocessed cache
             train_text_encoder: If True, returns tokens instead of embeddings
         """
-        self.cache_dir = Path(cache_dir)
         self.train_text_encoder = train_text_encoder
-
-        # Load metadata
-        self.metadata = self._load_metadata()
-
-        logger.info(f"Loaded dataset with {len(self.metadata)} samples")
-
-        # Group by bucket
-        self._group_by_bucket()
-
-        # Initialize bucket calculator for dynamic batch sizes
-        self.calculator = MultiTierBucketCalculator(quantization=64)
-
-    def _load_metadata(self) -> List[Dict]:
-        """Load metadata from cache directory.
-
-        Expects metadata.json with "shards" key referencing shard files.
-        """
-        metadata_file = self.cache_dir / "metadata.json"
-
-        if not metadata_file.exists():
-            raise FileNotFoundError(f"No metadata.json found in {self.cache_dir}")
-
-        with open(metadata_file, "r") as f:
-            data = json.load(f)
-
-        if not isinstance(data, dict) or "shards" not in data:
-            raise ValueError(f"Invalid metadata format in {metadata_file}. Expected dict with 'shards' key.")
-
-        # Load all shard files
-        metadata = []
-        for shard_name in data["shards"]:
-            shard_path = self.cache_dir / shard_name
-            with open(shard_path, "r") as f:
-                shard_data = json.load(f)
-                metadata.extend(shard_data)
-
-        return metadata
-
-    def _aspect_ratio_to_name(self, aspect_ratio: float) -> str:
-        """Convert aspect ratio to a descriptive name."""
-        if aspect_ratio < 0.85:
-            return "tall"
-        elif aspect_ratio > 1.18:
-            return "wide"
-        else:
-            return "square"
-
-    def _group_by_bucket(self):
-        """Group samples by bucket (aspect_ratio + resolution)."""
-        self.bucket_groups = {}
-
-        for idx, item in enumerate(self.metadata):
-            # Bucket key: aspect_name/resolution
-            aspect_ratio = item.get("aspect_ratio", 1.0)
-            aspect_name = self._aspect_ratio_to_name(aspect_ratio)
-            resolution = tuple(item["crop_resolution"])
-            bucket_key = (aspect_name, resolution)
-
-            if bucket_key not in self.bucket_groups:
-                self.bucket_groups[bucket_key] = {
-                    "indices": [],
-                    "aspect_name": aspect_name,
-                    "aspect_ratio": aspect_ratio,
-                    "resolution": resolution,
-                    "pixels": resolution[0] * resolution[1],
-                }
-
-            self.bucket_groups[bucket_key]["indices"].append(idx)
-
-        # Sort buckets by resolution (low to high for optimal memory usage)
-        self.sorted_bucket_keys = sorted(self.bucket_groups.keys(), key=lambda k: self.bucket_groups[k]["pixels"])
-
-        logger.info(f"\nDataset organized into {len(self.bucket_groups)} buckets:")
-        for key in self.sorted_bucket_keys:
-            bucket = self.bucket_groups[key]
-            aspect_name, resolution = key
-            logger.info(
-                f"  {aspect_name:6s} {resolution[0]:4d}x{resolution[1]:4d}: {len(bucket['indices']):5d} samples"
-            )
-
-    def get_bucket_info(self) -> Dict:
-        """Get bucket organization information."""
-        return {
-            "total_buckets": len(self.bucket_groups),
-            "buckets": {f"{k[0]}/{k[1][0]}x{k[1][1]}": len(v["indices"]) for k, v in self.bucket_groups.items()},
-        }
-
-    def __len__(self) -> int:
-        return len(self.metadata)
+        super().__init__(cache_dir, quantization=64)
 
     def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
         """Load a single sample."""
diff --git a/nemo_automodel/components/datasets/diffusion/text_to_video_dataset.py b/nemo_automodel/components/datasets/diffusion/text_to_video_dataset.py
new file mode 100644
index 000000000..4082d6fb8
--- /dev/null
+++ b/nemo_automodel/components/datasets/diffusion/text_to_video_dataset.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+from pathlib import Path
+from typing import Dict, List
+
+import torch
+
+from .base_dataset import BaseMultiresolutionDataset
+
+VIDEO_OPTIONAL_FIELDS = ("text_mask", "text_embeddings_2", "text_mask_2", "image_embeds")
+
+
+def load_optional_video_fields(data: dict, device: str = "cpu") -> dict:
+    """Collect the optional fields that are present and non-None in ``data``, each moved to ``device``."""
+    return {
+        name: data[name].to(device)
+        for name in VIDEO_OPTIONAL_FIELDS
+        if name in data and data[name] is not None
+    }
+
+
+def collate_optional_video_fields(batch: List[Dict], result: dict) -> None:
+    """Concatenate (dim 0) every optional field found on the first sample and store it in ``result``."""
+    # Presence is decided by the first sample alone; an empty batch leaves ``result`` untouched.
+    first = batch[0] if batch else {}
+    for field in VIDEO_OPTIONAL_FIELDS:
+        if field in first:
+            result[field] = torch.cat([sample[field] for sample in batch], dim=0)
+
+
+class TextToVideoDataset(BaseMultiresolutionDataset):
+    """Text-to-Video dataset over pickled ``.meta`` cache files, one per metadata entry.
+
+    Each sample is read from the path stored under ``cache_file`` in its metadata entry.
+    Compatible with SequentialBucketSampler for multiresolution training.
+    """
+
+    def __init__(self, cache_dir: str, model_type: str = "wan", device: str = "cpu"):
+        """
+        Args:
+            cache_dir: Directory containing preprocessed cache (metadata.json + shards + WxH/*.meta)
+            model_type: Model type for model-specific fields ("wan", "hunyuan", etc.)
+            device: Device that tensors are moved to when a sample is loaded
+        """
+        self.model_type = model_type
+        self.device = device
+        # The base initializer is invoked with a quantization of 8 for this dataset.
+        super().__init__(cache_dir, quantization=8)
+
+    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+        """Unpickle the sample at ``idx`` and return its tensors plus bucket metadata."""
+        entry = self.metadata[idx]
+
+        # NOTE(review): pickle.load can run arbitrary code -- only read caches from trusted sources.
+        with Path(entry["cache_file"]).open("rb") as handle:
+            payload = pickle.load(handle)
+
+        target = self.device
+        sample = {
+            "video_latents": payload["video_latents"].to(target),
+            "text_embeddings": payload["text_embeddings"].to(target),
+            "bucket_resolution": torch.tensor(entry["bucket_resolution"]),
+            "aspect_ratio": entry.get("aspect_ratio", 1.0),
+        }
+
+        # Merge whichever optional model-specific fields the payload carries.
+        sample.update(load_optional_video_fields(payload, target))
+
+        return sample
+
diff --git a/nemo_automodel/recipes/base_recipe.py b/nemo_automodel/recipes/base_recipe.py
index 46f305290..b66ca0386 100644
--- a/nemo_automodel/recipes/base_recipe.py
+++ b/nemo_automodel/recipes/base_recipe.py
@@ -351,9 +351,24 @@ def to_item(x):
             # Unwrap DDP if present
             if isinstance(unwrapped_model, DistributedDataParallel):
                 unwrapped_model = unwrapped_model.module
-            unwrapped_model.save_pretrained(
-                save_directory=path, checkpointer=self.checkpointer, tokenizer=tokenizer, peft_config=self.peft_config
-            )
+            # Models with HFCheckpointingMixin route save_pretrained through checkpointer.save_model (DCP).
+            # Models without it (e.g. diffusers) would use their native save_pretrained which fails on
+            # FSDP2-sharded DTensors, so fall back to checkpointer.save_model directly.
+            if hasattr(unwrapped_model, 'save_pretrained') and hasattr(unwrapped_model.save_pretrained, '__func__'):
+                from nemo_automodel.components.models.common.hf_checkpointing_mixin import HFCheckpointingMixin
+
+                if isinstance(unwrapped_model, HFCheckpointingMixin):
+                    unwrapped_model.save_pretrained(
+                        save_directory=path, checkpointer=self.checkpointer, tokenizer=tokenizer, peft_config=self.peft_config
+                    )
+                else:
+                    self.checkpointer.save_model(
+                        model=unwrapped_model, weights_path=path, peft_config=self.peft_config, tokenizer=tokenizer
+                    )
+            else:
+                self.checkpointer.save_model(
+                    model=unwrapped_model, weights_path=path, peft_config=self.peft_config, tokenizer=tokenizer
+                )
 
         # Sync before checkpointing for Dion
         optimizers = optimizer if isinstance(optimizer, list) else [optimizer]
diff --git a/nemo_automodel/recipes/diffusion/train.py b/nemo_automodel/recipes/diffusion/train.py
index 64f5733f4..5e2d7d913 100644
--- a/nemo_automodel/recipes/diffusion/train.py
+++ b/nemo_automodel/recipes/diffusion/train.py
@@ -620,7 +620,7 @@ def run_train_validation_loop(self):
                         )
 
                 if self.step_scheduler.is_ckpt_step:
-                    self.save_checkpoint(epoch, global_step)
+                    self.save_checkpoint(epoch, global_step, epoch_loss / num_steps)
 
             avg_loss = epoch_loss / num_steps
             logging.info(f"[INFO] Epoch {epoch + 1} complete. avg_loss={avg_loss:.6f}")
diff --git a/pyproject.toml b/pyproject.toml
index b3c32c247..49c071594 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,7 +94,9 @@ diffusion = [
     "ftfy",
     "imageio",
     "imageio-ffmpeg",
+    "kernels",
     "opencv-python-headless",
+    "torchvision",
 ]
 # nvidia-cudnn-cu12 pin: This is required for GPTOSS TE support + faster cudnn attention (only on Linux where CUDA is available)
 # "nvidia-cudnn-cu12>=9.18.0.0; sys_platform == 'linux'",
@@ -205,6 +207,11 @@ torch = [
   { index = "pytorch-cu129", marker = "sys_platform == 'linux'" },
   { index = "pypi", marker = "sys_platform == 'darwin'" },
 ]
+torchvision = [
+  { index = "pytorch-cpu", marker = "sys_platform != 'linux' and sys_platform != 'darwin'" },
+  { index = "pytorch-cu129", marker = "sys_platform == 'linux'" },
+  { index = "pypi", marker = "sys_platform == 'darwin'" },
+]
 
 [[tool.uv.index]]
 name = "pypi"
diff --git a/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py b/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py
index fdc155fec..d657334f0 100644
--- a/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py
+++ b/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Unit tests for collate_fns.py: collate_fn_flux, build_flux_multiresolution_dataloader."""
+"""Unit tests for collate_fns.py: collate_fn_text_to_image and build_text_to_image_multiresolution_dataloader."""
 
 import json
 import tempfile
@@ -23,8 +23,8 @@
 import torch
 
 from nemo_automodel.components.datasets.diffusion.collate_fns import (
-    build_flux_multiresolution_dataloader,
-    collate_fn_flux,
+    build_text_to_image_multiresolution_dataloader,
+    collate_fn_text_to_image,
 )
 
 
@@ -58,12 +58,12 @@ def _make_production_batch(
 
 
 # =============================================================================
-# TestCollateFnFlux
+# TestCollateFnTextToImage
 # =============================================================================
 
 
-class TestCollateFnFlux:
-    """Tests for collate_fn_flux."""
+class TestCollateFnTextToImage:
+    """Tests for collate_fn_text_to_image."""
 
     def test_pre_encoded_embeddings(self):
         prod_batch = _make_production_batch(has_prompt_embeds=True)
@@ -71,7 +71,7 @@ def test_pre_encoded_embeddings(self):
             "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production",
             return_value=prod_batch,
         ):
-            result = collate_fn_flux([{}, {}])  # Dummy batch items
+            result = collate_fn_text_to_image([{}, {}])  # Dummy batch items
 
         assert "image_latents" in result
         assert "text_embeddings" in result
@@ -85,7 +85,7 @@ def test_with_clip_hidden(self):
             "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production",
             return_value=prod_batch,
         ):
-            result = collate_fn_flux([{}, {}])
+            result = collate_fn_text_to_image([{}, {}])
 
         assert "clip_hidden" in result
 
@@ -95,7 +95,7 @@ def test_without_clip_hidden(self):
             "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production",
             return_value=prod_batch,
         ):
-            result = collate_fn_flux([{}, {}])
+            result = collate_fn_text_to_image([{}, {}])
 
         assert "clip_hidden" not in result
 
@@ -108,7 +108,7 @@ def test_tokenized_input_raises(self):
             return_value=prod_batch,
         ):
             with pytest.raises(NotImplementedError, match="On-the-fly text encoding"):
-                collate_fn_flux([{}, {}])
+                collate_fn_text_to_image([{}, {}])
 
     def test_metadata_fields(self):
         prod_batch = _make_production_batch(has_prompt_embeds=True)
@@ -116,7 +116,7 @@ def test_metadata_fields(self):
             "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production",
             return_value=prod_batch,
         ):
-            result = collate_fn_flux([{}, {}])
+            result = collate_fn_text_to_image([{}, {}])
 
         meta = result["metadata"]
         assert "prompts" in meta
@@ -129,7 +129,7 @@ def test_metadata_fields(self):
 
 
 # =============================================================================
-# TestBuildFluxMultiresolutionDataloader
+# TestBuildTextToImageMultiresolutionDataloader
 # =============================================================================
 
 
@@ -177,8 +177,8 @@ def build_cache(self, resolution=(512, 512)):
         return metadata
 
 
-class TestBuildFluxMultiresolutionDataloader:
-    """Tests for build_flux_multiresolution_dataloader."""
+class TestBuildTextToImageMultiresolutionDataloader:
+    """Tests for build_text_to_image_multiresolution_dataloader."""
 
     def test_returns_dataloader_and_sampler(self):
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -186,7 +186,7 @@ def test_returns_dataloader_and_sampler(self):
             builder = MockCacheBuilder(cache_dir, num_samples=10)
             builder.build_cache()
 
-            dl, sampler = build_flux_multiresolution_dataloader(
+            dl, sampler = build_text_to_image_multiresolution_dataloader(
                 cache_dir=str(cache_dir),
                 batch_size=2,
                 dp_rank=0,
@@ -205,7 +205,7 @@ def test_iteration(self):
             builder = MockCacheBuilder(cache_dir, num_samples=10)
             builder.build_cache()
 
-            dl, _ = build_flux_multiresolution_dataloader(
+            dl, _ = build_text_to_image_multiresolution_dataloader(
                 cache_dir=str(cache_dir),
                 batch_size=2,
                 dp_rank=0,
diff --git a/tests/unit_tests/datasets/diffusion/test_dataloader.py b/tests/unit_tests/datasets/diffusion/test_dataloader.py
index 3c820776c..47421039d 100644
--- a/tests/unit_tests/datasets/diffusion/test_dataloader.py
+++ b/tests/unit_tests/datasets/diffusion/test_dataloader.py
@@ -17,7 +17,7 @@
 This module contains both CPU and GPU tests for:
 - SequentialBucketSampler
 - collate_fn_production
-- build_multiresolution_dataloader
+- _build_multiresolution_dataloader_core
 
 GPU tests are skipped when CUDA is not available.
 """
@@ -30,10 +30,12 @@
 import pytest
 import torch
 
+from nemo_automodel.components.datasets.diffusion.collate_fns import (
+    _build_multiresolution_dataloader_core,
+    collate_fn_production,
+)
 from nemo_automodel.components.datasets.diffusion.sampler import (
     SequentialBucketSampler,
-    build_multiresolution_dataloader,
-    collate_fn_production,
 )
 from nemo_automodel.components.datasets.diffusion.text_to_image_dataset import (
     TextToImageDataset,
@@ -182,7 +184,7 @@ def test_sampler_init_basic(self, simple_dataset):
         """Test basic sampler initialization."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -196,7 +198,7 @@ def test_sampler_len(self, simple_dataset):
         """Test sampler __len__ returns correct batch count."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -209,7 +211,7 @@ def test_sampler_iter_yields_batches(self, simple_dataset):
         """Test sampler iteration yields batches of indices."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -226,7 +228,7 @@ def test_sampler_batch_size_respected(self, simple_dataset):
         batch_size = 4
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=batch_size,
+            batch_size=batch_size,
             num_replicas=1,
             rank=0,
             drop_last=True,
@@ -241,7 +243,7 @@ def test_sampler_drop_last_false(self, simple_dataset):
         """Test sampler with drop_last=False includes all samples."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
             drop_last=False,
@@ -257,7 +259,7 @@ def test_sampler_set_epoch(self, simple_dataset):
         """Test set_epoch changes sampler state."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -270,7 +272,7 @@ def test_sampler_deterministic_shuffling(self, simple_dataset):
         """Test same seed produces same batch order."""
         sampler1 = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
             seed=42,
@@ -278,7 +280,7 @@ def test_sampler_deterministic_shuffling(self, simple_dataset):
 
         sampler2 = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
             seed=42,
@@ -293,7 +295,7 @@ def test_sampler_different_seeds_different_order(self, simple_dataset):
         """Test different seeds produce different batch orders."""
         sampler1 = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
             seed=42,
@@ -302,7 +304,7 @@ def test_sampler_different_seeds_different_order(self, simple_dataset):
 
         sampler2 = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
             seed=123,
@@ -321,7 +323,7 @@ def test_sampler_no_shuffle(self, simple_dataset):
         """Test sampler without shuffling."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
             shuffle_buckets=False,
@@ -335,7 +337,7 @@ def test_sampler_different_order_across_epochs(self, large_dataset):
         """Test that bucket and element order differs across epochs."""
         sampler = SequentialBucketSampler(
             large_dataset,
-            base_batch_size=8,
+            batch_size=8,
             num_replicas=1,
             rank=0,
             seed=42,
@@ -380,7 +382,7 @@ def test_sampler_dynamic_batch_size_disabled(self, multi_resolution_dataset):
         batch_size = 4
         sampler = SequentialBucketSampler(
             multi_resolution_dataset,
-            base_batch_size=batch_size,
+            batch_size=batch_size,
             dynamic_batch_size=False,
             num_replicas=1,
             rank=0,
@@ -396,7 +398,7 @@ def test_sampler_dynamic_batch_size_enabled(self, multi_resolution_dataset):
         """Test sampler with dynamic_batch_size=True varies batch size."""
         sampler = SequentialBucketSampler(
             multi_resolution_dataset,
-            base_batch_size=8,
+            batch_size=8,
             base_resolution=(512, 512),
             dynamic_batch_size=True,
             num_replicas=1,
@@ -417,7 +419,7 @@ def test_sampler_get_batch_info(self, simple_dataset):
         """Test get_batch_info returns bucket information."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -442,7 +444,7 @@ def test_multi_rank_same_batch_count(self, large_dataset):
         for rank in range(world_size):
             sampler = SequentialBucketSampler(
                 large_dataset,
-                base_batch_size=8,
+                batch_size=8,
                 num_replicas=world_size,
                 rank=rank,
             )
@@ -457,7 +459,7 @@ def test_multi_rank_different_samples(self, large_dataset):
 
         sampler0 = SequentialBucketSampler(
             large_dataset,
-            base_batch_size=8,
+            batch_size=8,
             num_replicas=world_size,
             rank=0,
             seed=42,
@@ -465,7 +467,7 @@ def test_multi_rank_different_samples(self, large_dataset):
 
         sampler1 = SequentialBucketSampler(
             large_dataset,
-            base_batch_size=8,
+            batch_size=8,
             num_replicas=world_size,
             rank=1,
             seed=42,
@@ -486,7 +488,7 @@ def test_single_rank_equivalent(self, simple_dataset):
         """Test single rank (world_size=1) processes all data."""
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -570,18 +572,19 @@ def test_collate_same_resolution_required(self, multi_resolution_dataset):
 
 
 # ============================================================================
-# CPU Tests - build_multiresolution_dataloader
+# CPU Tests - _build_multiresolution_dataloader_core
 # ============================================================================
 
 
-class TestBuildMultiresolutionDataloaderCPU:
-    """CPU tests for build_multiresolution_dataloader."""
+class TestBuildMultiresolutionDataloaderCoreCPU:
+    """CPU tests for _build_multiresolution_dataloader_core."""
 
     def test_build_dataloader_returns_tuple(self, simple_dataset):
         """Test function returns dataloader and sampler."""
-        dataloader, sampler = build_multiresolution_dataloader(
+        dataloader, sampler = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -593,9 +596,10 @@ def test_build_dataloader_returns_tuple(self, simple_dataset):
 
     def test_dataloader_iteration(self, simple_dataset):
         """Test dataloader can be iterated."""
-        dataloader, sampler = build_multiresolution_dataloader(
+        dataloader, sampler = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -613,9 +617,10 @@ def test_dataloader_iteration(self, simple_dataset):
 
     def test_dataloader_batch_content(self, simple_dataset):
         """Test dataloader batches have correct content."""
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -628,9 +633,10 @@ def test_dataloader_batch_content(self, simple_dataset):
 
     def test_dataloader_with_shuffle(self, simple_dataset):
         """Test dataloader with shuffle enabled."""
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             shuffle=True,
@@ -643,9 +649,10 @@ def test_dataloader_with_shuffle(self, simple_dataset):
 
     def test_dataloader_without_shuffle(self, simple_dataset):
         """Test dataloader with shuffle disabled."""
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             shuffle=False,
@@ -658,9 +665,10 @@ def test_dataloader_without_shuffle(self, simple_dataset):
 
     def test_dataloader_with_dynamic_batch(self, multi_resolution_dataset):
         """Test dataloader with dynamic batch sizing."""
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=multi_resolution_dataset,
-            base_batch_size=8,
+            batch_size=8,
             base_resolution=(512, 512),
             dp_rank=0,
             dp_world_size=1,
@@ -687,7 +695,7 @@ def test_sampler_with_gpu_tensors(self, simple_dataset):
 
         sampler = SequentialBucketSampler(
             simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             num_replicas=1,
             rank=0,
         )
@@ -747,20 +755,21 @@ def test_collate_then_transfer_to_gpu(self, simple_dataset):
 
 
 # ============================================================================
-# GPU Tests - build_multiresolution_dataloader
+# GPU Tests - _build_multiresolution_dataloader_core
 # ============================================================================
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA")
-class TestBuildMultiresolutionDataloaderGPU:
-    """GPU tests for build_multiresolution_dataloader."""
+class TestBuildMultiresolutionDataloaderCoreGPU:
+    """GPU tests for _build_multiresolution_dataloader_core."""
 
     def test_dataloader_with_pin_memory(self, simple_dataset):
         """Test dataloader with pin_memory for faster GPU transfer."""
 
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             pin_memory=True,
@@ -775,9 +784,10 @@ def test_dataloader_with_pin_memory(self, simple_dataset):
 
     def test_dataloader_batch_to_gpu(self, simple_dataset):
         """Test full batch transfer to GPU."""
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -814,9 +824,10 @@ def test_dataloader_gpu_memory_cleanup(self, simple_dataset):
         torch.cuda.empty_cache()
         initial_memory = torch.cuda.memory_allocated()
 
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -846,9 +857,10 @@ def test_dataloader_multi_gpu_simulation(self, large_dataset):
         # Create dataloaders for each GPU (simulated)
         dataloaders = []
         for rank in range(min(gpu_count, 2)):  # Use up to 2 GPUs for test
-            dl, _ = build_multiresolution_dataloader(
+            dl, _ = _build_multiresolution_dataloader_core(
+                collate_fn=collate_fn_production,
                 dataset=large_dataset,
-                base_batch_size=8,
+                batch_size=8,
                 dp_rank=rank,
                 dp_world_size=min(gpu_count, 2),
                 num_workers=0,
@@ -867,9 +878,10 @@ def test_dataloader_multi_gpu_simulation(self, large_dataset):
 
     def test_gpu_operations_on_batch(self, simple_dataset):
         """Test performing GPU operations on loaded batch."""
-        dataloader, _ = build_multiresolution_dataloader(
+        dataloader, _ = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -902,9 +914,10 @@ class TestDataloaderIntegration:
 
     def test_full_epoch_iteration_cpu(self, simple_dataset):
         """Test iterating through a full epoch on CPU."""
-        dataloader, sampler = build_multiresolution_dataloader(
+        dataloader, sampler = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -919,9 +932,10 @@ def test_full_epoch_iteration_cpu(self, simple_dataset):
 
     def test_multiple_epochs_cpu(self, simple_dataset):
         """Test iterating through multiple epochs."""
-        dataloader, sampler = build_multiresolution_dataloader(
+        dataloader, sampler = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             num_workers=0,
@@ -937,9 +951,10 @@ def test_multiple_epochs_cpu(self, simple_dataset):
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA")
     def test_full_epoch_iteration_gpu(self, simple_dataset):
         """Test iterating through a full epoch with GPU transfer."""
-        dataloader, sampler = build_multiresolution_dataloader(
+        dataloader, sampler = _build_multiresolution_dataloader_core(
+            collate_fn=collate_fn_production,
             dataset=simple_dataset,
-            base_batch_size=4,
+            batch_size=4,
             dp_rank=0,
             dp_world_size=1,
             pin_memory=True,
@@ -964,7 +979,7 @@ def test_deterministic_across_ranks(self, large_dataset):
         # Create samplers for two ranks
         sampler0 = SequentialBucketSampler(
             large_dataset,
-            base_batch_size=8,
+            batch_size=8,
             num_replicas=world_size,
             rank=0,
             seed=seed,
@@ -972,7 +987,7 @@ def test_deterministic_across_ranks(self, large_dataset):
 
         sampler1 = SequentialBucketSampler(
             large_dataset,
-            base_batch_size=8,
+            batch_size=8,
             num_replicas=world_size,
             rank=1,
             seed=seed,
diff --git a/tools/diffusion/processors/wan.py b/tools/diffusion/processors/wan.py
index 8fa2ea004..750e53404 100644
--- a/tools/diffusion/processors/wan.py
+++ b/tools/diffusion/processors/wan.py
@@ -111,6 +111,8 @@ def load_models(self, model_name: str, device: str) -> Dict[str, Any]:
         from transformers import AutoTokenizer, UMT5EncoderModel
 
         dtype = torch.float16 if "cuda" in device else torch.float32
+        # UMT5 requires bfloat16 (float16 causes overflow/zeros in attention and layer norm)
+        text_encoder_dtype = torch.bfloat16 if "cuda" in device else torch.float32
 
         logger.info("[Wan] Loading models from %s...", model_name)
 
@@ -119,8 +121,19 @@ def load_models(self, model_name: str, device: str) -> Dict[str, Any]:
         text_encoder = UMT5EncoderModel.from_pretrained(
             model_name,
             subfolder="text_encoder",
-            torch_dtype=dtype,
+            torch_dtype=text_encoder_dtype,
         )
+        # Workaround for transformers>=5.0.0 weight tying regression:
+        # The Wan2.1 checkpoint stores the token embedding as "shared.weight", which
+        # transformers<5 automatically tied to "encoder.embed_tokens.weight". In v5+,
+        # this tying no longer happens during from_pretrained(), leaving embed_tokens
+        # zero-initialized and producing all-zero text embeddings.
+        if (
+            hasattr(text_encoder, "shared")
+            and hasattr(text_encoder.encoder, "embed_tokens")
+            and text_encoder.encoder.embed_tokens.weight.data_ptr() != text_encoder.shared.weight.data_ptr()
+        ):
+            text_encoder.encoder.embed_tokens.weight = text_encoder.shared.weight
         text_encoder.to(device)
         text_encoder.eval()
 
diff --git a/uv.lock b/uv.lock
index 4c5dfd33c..5a7c5beca 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2275,6 +2275,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
 ]
 
+[[package]]
+name = "kernels"
+version = "0.12.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "packaging" },
+    { name = "pyyaml" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a9/07/d2b635e965b232cae1aa873c6e0458947196be8dca7bb02e64d3cd6e8d19/kernels-0.12.2.tar.gz", hash = "sha256:812fc43c2814f046cee655cbebf3918cddd489715773670bdb38cca3f5203b5b", size = 57108, upload-time = "2026-03-04T10:03:00.379Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/08/be/f5d6758b48633e4f6a28198fcf4bf9f763cc6a82e2335d9fe8802a5cb440/kernels-0.12.2-py3-none-any.whl", hash = "sha256:1289261804748cf3cf8e3afab80b505b0f1b28e4ec88379cdf08dc31e64964b8", size = 55205, upload-time = "2026-03-04T10:02:59.305Z" },
+]
+
 [[package]]
 name = "kiwisolver"
 version = "1.4.9"
@@ -3240,6 +3255,7 @@ all = [
     { name = "ftfy" },
     { name = "imageio" },
     { name = "imageio-ffmpeg" },
+    { name = "kernels" },
     { name = "mamba-ssm" },
     { name = "mistral-common", extra = ["opencv"] },
     { name = "numba", version = "0.53.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
@@ -3257,6 +3273,10 @@ all = [
     { name = "sentencepiece" },
     { name = "timm" },
     { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
+    { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
+    { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
     { name = "transformer-engine", extra = ["pytorch"] },
 ]
 cuda = [
@@ -3276,7 +3296,12 @@ diffusion = [
     { name = "ftfy" },
     { name = "imageio" },
     { name = "imageio-ffmpeg" },
+    { name = "kernels" },
     { name = "opencv-python-headless" },
+    { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
+    { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
 ]
 extra = [
     { name = "flash-linear-attention" },
@@ -3365,6 +3390,7 @@ requires-dist = [
     { name = "ftfy", marker = "extra == 'diffusion'" },
     { name = "imageio", marker = "extra == 'diffusion'" },
     { name = "imageio-ffmpeg", marker = "extra == 'diffusion'" },
+    { name = "kernels", marker = "extra == 'diffusion'" },
     { name = "mamba-ssm", marker = "extra == 'cuda'" },
     { name = "megatron-fsdp", specifier = ">=0.2.3" },
     { name = "mistral-common", extras = ["audio", "hf-hub", "image", "sentencepiece"] },
@@ -3397,6 +3423,9 @@ requires-dist = [
     { name = "torchao" },
     { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
     { name = "torchdata" },
+    { name = "torchvision", marker = "sys_platform == 'darwin' and extra == 'diffusion'", index = "https://pypi.org/simple" },
+    { name = "torchvision", marker = "sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cpu" },
+    { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cu129" },
     { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'cuda'", specifier = "<=2.11.0" },
     { name = "transformers", specifier = ">=5.0.0" },
     { name = "wandb" },
@@ -4041,8 +4070,10 @@ dependencies = [
     { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
-    { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
-    { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
+    { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
     { name = "tqdm" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/30/46/fb8be250fa7fcfc56fbeb41583645e18d868268f67fbbbeb8ed62a8ff18a/open_clip_torch-3.2.0.tar.gz", hash = "sha256:62b7743012ccc40fb7c64819fa762fba0a13dd74585ac733babe58c2974c2506", size = 1502853, upload-time = "2025-09-21T17:32:08.289Z" }
@@ -6460,8 +6491,10 @@ dependencies = [
     { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
-    { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" },
-    { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" },
+    { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
+    { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/c5/9d/e4670765d1c033f97096c760b3b907eeb659cf80f3678640e5f060b04c6c/timm-1.0.22.tar.gz", hash = "sha256:14fd74bcc17db3856b1a47d26fb305576c98579ab9d02b36714a5e6b25cde422", size = 2382998, upload-time = "2025-11-05T04:06:09.377Z" }
 wheels = [
@@ -6722,45 +6755,60 @@ wheels = [
 [[package]]
 name = "torchvision"
 version = "0.24.0"
-source = { registry = "https://pypi.org/simple" }
+source = { registry = "https://download.pytorch.org/whl/cu129" }
 resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" },
+    { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl" },
+]
+
+[[package]]
+name = "torchvision"
+version = "0.24.0+cu129"
+source = { registry = "https://download.pytorch.org/whl/cu129" }
+resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version < '3.11' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
 ]
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform == 'linux'" },
-    { name = "pillow", marker = "sys_platform == 'linux'" },
-    { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" },
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "pillow", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
+    { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/88/e3/1b003ecd52bd721f8304aeb66691edfbc2002747ec83d36188ad6abab506/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a110a51c75e89807a8382b0d8034f5e180fb9319570be3389ffd3d4ac4fd57a9", size = 2418988, upload-time = "2025-10-15T15:51:25.195Z" },
-    { url = "https://files.pythonhosted.org/packages/56/2e/3c19a35e62da0f606baf8f6e2ceeab1eb66aaa2f84c6528538b06b416d54/torchvision-0.24.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:81d5b12a6df1bb2cc8bdbad837b637d6ea446f2866e6d94f1b5d478856331be3", size = 8046769, upload-time = "2025-10-15T15:51:15.221Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/07/0cd6776eee784742ad3cb2bfd3295383d84cb2f9e87386119333d1587f0f/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbd63bf4ebff84c48c50123eba90526cc9f794fe45bc9f5dd07cec19e8c62bce", size = 2420513, upload-time = "2025-10-15T15:51:18.087Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/f4/6026c08011ddcefcbc14161c5aa9dce55c35c6b045e04ef0952e88bf4594/torchvision-0.24.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:78fe414b3bb6dbf7e6f6da6f733ba96881f6b29a9b997228de7c5f603e5ed940", size = 8048018, upload-time = "2025-10-15T15:51:13.579Z" },
-    { url = "https://files.pythonhosted.org/packages/00/7b/e3809b3302caea9a12c13f3adebe4fef127188438e719fd6c8dc93db1da6/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b0531d1483fc322d7da0d83be52f0df860a75114ab87dbeeb9de765feaeda843", size = 2419495, upload-time = "2025-10-15T15:51:11.885Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/e6/7324ead6793075a8c75c56abeed1236d1750de16a5613cfe2ddad164a92a/torchvision-0.24.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:26b9dd9c083f8e5f7ac827de6d5b88c615d9c582dc87666770fbdf16887e4c25", size = 8050480, upload-time = "2025-10-15T15:51:24.012Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/02/e2f6b0ff93ca4db5751ac9c5be43f13d5e53d9e9412324f464dca1775027/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:fec12a269cf80f6b0b71471c8d498cd3bdd9d8e892c425bf39fecb604852c3b0", size = 2371478, upload-time = "2025-10-15T15:51:37.842Z" },
-    { url = "https://files.pythonhosted.org/packages/77/85/42e5fc4f716ec7b73cf1f32eeb5c77961be4d4054b26cd6a5ff97f20c966/torchvision-0.24.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7323a9be5e3da695605753f501cdc87824888c5655d27735cdeaa9986b45884c", size = 8050200, upload-time = "2025-10-15T15:51:46.276Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/cf/2d7e43409089ce7070f5336161f9216d58653ee1cb26bcb5d6c84cc2de36/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:b1b3db80609c32a088554e8e94b4fc31f1033fe5bb4ac0673ec49c3eb03fb4da", size = 2374466, upload-time = "2025-10-15T15:51:35.382Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/30/8f7c328fd7e0a9665da4b6b56b1c627665c18470bfe62f3729ad3eda9aec/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:e6635f100d455c80b43f297df4b8585a76c6a2e114802f6567ddd28d7b5479b0", size = 8217068, upload-time = "2025-10-15T15:51:36.623Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/d7/69479a066ea773653e88eda99031e38681e9094046f87cb957af5036db0e/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:73576a9c4a593223fbae85a64e8bbd77049abd1101893ecf3c5e981284fd58b4", size = 2371609, upload-time = "2025-10-15T15:51:29.859Z" },
-    { url = "https://files.pythonhosted.org/packages/46/64/3c7fdb3771ec992b9445a1f7a969466b23ce2cdb14e09303b3db351a0655/torchvision-0.24.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:dd565b1b06666ff399d0801d4d1824fa570c0167a179ca700a5be232527b3c62", size = 8214918, upload-time = "2025-10-15T15:51:41.465Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/fd/615d8a86db1578345de7fa1edaf476fbcf4f057bf7e4fd898306b620c487/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:64e54494043eecf9f57a9881c6fdea49c62282782e737c002ae8b1639e6ea80e", size = 2374469, upload-time = "2025-10-15T15:51:40.19Z" },
-    { url = "https://files.pythonhosted.org/packages/04/98/bac11e8fdbf00d6c398246ff2781370aa72c99f2ac685c01ce79354c9a32/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:75ef9546323b321a451239d886f0cb528f7e98bb294da47a3200effd4e572064", size = 8217060, upload-time = "2025-10-15T15:51:45.033Z" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp310-cp310-manylinux_2_28_x86_64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp311-cp311-manylinux_2_28_x86_64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314-manylinux_2_28_x86_64.whl" },
+    { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314t-manylinux_2_28_x86_64.whl" },
 ]
 
 [[package]]
@@ -6768,16 +6816,6 @@ name = "torchvision"
 version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
     "python_full_version >= '3.14' and sys_platform == 'darwin'",
     "python_full_version == '3.13.*' and sys_platform == 'darwin'",
     "python_full_version == '3.12.*' and sys_platform == 'darwin'",
@@ -6785,27 +6823,51 @@ resolution-markers = [
     "python_full_version < '3.11' and sys_platform == 'darwin'",
 ]
 dependencies = [
-    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform != 'linux'" },
-    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform != 'linux'" },
-    { name = "pillow", marker = "sys_platform != 'linux'" },
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform == 'darwin'" },
+    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform == 'darwin'" },
+    { name = "pillow", marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" },
-    { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/50/ae/cbf727421eb73f1cf907fbe5788326a08f111b3f6b6ddca15426b53fec9a/torchvision-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a95c47abb817d4e90ea1a8e57bd0d728e3e6b533b3495ae77d84d883c4d11f56", size = 1874919, upload-time = "2026-01-21T16:27:47.617Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/b9/a53bcf8f78f2cd89215e9ded70041765d50ef13bf301f9884ec6041a9421/torchvision-0.25.0-cp310-cp310-win_amd64.whl", hash = "sha256:b57430fbe9e9b697418a395041bb615124d9c007710a2712fda6e35fb310f264", size = 3697295, upload-time = "2026-01-21T16:27:36.574Z" },
     { url = "https://files.pythonhosted.org/packages/3e/be/c704bceaf11c4f6b19d64337a34a877fcdfe3bd68160a8c9ae9bea4a35a3/torchvision-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:db74a551946b75d19f9996c419a799ffdf6a223ecf17c656f90da011f1d75b20", size = 1874923, upload-time = "2026-01-21T16:27:46.574Z" },
-    { url = "https://files.pythonhosted.org/packages/23/19/55b28aecdc7f38df57b8eb55eb0b14a62b470ed8efeb22cdc74224df1d6a/torchvision-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:ea580ffd6094cc01914ad32f8c8118174f18974629af905cea08cb6d5d48c7b7", size = 4038722, upload-time = "2026-01-21T16:27:41.355Z" },
     { url = "https://files.pythonhosted.org/packages/56/3a/6ea0d73f49a9bef38a1b3a92e8dd455cea58470985d25635beab93841748/torchvision-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c2abe430c90b1d5e552680037d68da4eb80a5852ebb1c811b2b89d299b10573b", size = 1874920, upload-time = "2026-01-21T16:27:45.348Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/16/8f650c2e288977cf0f8f85184b90ee56ed170a4919347fc74ee99286ed6f/torchvision-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9c55ae8d673ab493325d1267cbd285bb94d56f99626c00ac4644de32a59ede3", size = 4303059, upload-time = "2026-01-21T16:27:11.08Z" },
     { url = "https://files.pythonhosted.org/packages/f5/5b/1562a04a6a5a4cf8cf40016a0cdeda91ede75d6962cff7f809a85ae966a5/torchvision-0.25.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:24e11199e4d84ba9c5ee7825ebdf1cd37ce8deec225117f10243cae984ced3ec", size = 1874918, upload-time = "2026-01-21T16:27:39.02Z" },
-    { url = "https://files.pythonhosted.org/packages/32/a5/9a9b1de0720f884ea50dbf9acb22cbe5312e51d7b8c4ac6ba9b51efd9bba/torchvision-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:cef0196be31be421f6f462d1e9da1101be7332d91984caa6f8022e6c78a5877f", size = 4321911, upload-time = "2026-01-21T16:27:35.195Z" },
     { url = "https://files.pythonhosted.org/packages/52/99/dca81ed21ebaeff2b67cc9f815a20fdaa418b69f5f9ea4c6ed71721470db/torchvision-0.25.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a8f8061284395ce31bcd460f2169013382ccf411148ceb2ee38e718e9860f5a7", size = 1896209, upload-time = "2026-01-21T16:27:32.159Z" },
-    { url = "https://files.pythonhosted.org/packages/63/cc/0ea68b5802e5e3c31f44b307e74947bad5a38cc655231d845534ed50ddb8/torchvision-0.25.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5e6b449e9fa7d642142c0e27c41e5a43b508d57ed8e79b7c0a0c28652da8678c", size = 4344260, upload-time = "2026-01-21T16:27:17.018Z" },
     { url = "https://files.pythonhosted.org/packages/9e/1f/fa839532660e2602b7e704d65010787c5bb296258b44fa8b9c1cd6175e7d/torchvision-0.25.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:620a236288d594dcec7634c754484542dc0a5c1b0e0b83a34bda5e91e9b7c3a1", size = 1896193, upload-time = "2026-01-21T16:27:24.785Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/eb/d0096eed5690d962853213f2ee00d91478dfcb586b62dbbb449fb8abc3a6/torchvision-0.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:d1abd5ed030c708f5dbf4812ad5f6fbe9384b63c40d6bd79f8df41a4a759a917", size = 4325058, upload-time = "2026-01-21T16:27:26.165Z" },
     { url = "https://files.pythonhosted.org/packages/97/36/96374a4c7ab50dea9787ce987815614ccfe988a42e10ac1a2e3e5b60319a/torchvision-0.25.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ad9a8a5877782944d99186e4502a614770fe906626d76e9cd32446a0ac3075f2", size = 1896207, upload-time = "2026-01-21T16:27:23.383Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/37/e7ca4ec820d434c0f23f824eb29f0676a0c3e7a118f1514f5b949c3356da/torchvision-0.25.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f07f01d27375ad89d72aa2b3f2180f07da95dd9d2e4c758e015c0acb2da72977", size = 4425879, upload-time = "2026-01-21T16:27:12.579Z" },
+]
+
+[[package]]
+name = "torchvision"
+version = "0.25.0+cpu"
+source = { registry = "https://download.pytorch.org/whl/cpu" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+    "python_full_version < '3.11' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" },
+    { name = "pillow", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
+    { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
+]
+wheels = [
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:3e2ae9981e32a5b9db685659d5c7af0f04b159ff20394650a90124baf6ada51a" },
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:c7eb5f219fdfaf1f65e68c00eb81172ab4fa08a9874dae9dad2bca360da34d0f" },
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:2d444009c0956669ada149f61ed78f257c1cc96d259efa6acf3929ca96ceb3f0" },
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:783c8fc580bbfc159bff52f4f72cdd538e42b32956e70dffa42b940db114e151" },
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:9212210f417888e6261c040495180f053084812cf873dedba9fc51ff4b24b2d3" },
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp314-cp314-win_amd64.whl", hash = "sha256:499eae1e535766391b6ee2d1e6e841239c20e2e6d88203a15b8f9f8d60a1f8bd" },
+    { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp314-cp314t-win_amd64.whl", hash = "sha256:fb9f07f6a10f0ac24ac482ae68c6df99110b74a0d80a4c64fddc9753267d8815" },
 ]
 
 [[package]]