diff --git a/docker/common/uv-pytorch.lock b/docker/common/uv-pytorch.lock index c26fd7e54..8bca7cbeb 100644 --- a/docker/common/uv-pytorch.lock +++ b/docker/common/uv-pytorch.lock @@ -68,7 +68,7 @@ overrides = [ { name = "nvidia-nccl-cu12", marker = "sys_platform == 'never'" }, { name = "torch", marker = "sys_platform == 'never'", index = "https://download.pytorch.org/whl/cpu" }, { name = "torchao", marker = "sys_platform == 'never'" }, - { name = "torchvision", marker = "sys_platform == 'never'" }, + { name = "torchvision", marker = "sys_platform == 'never'", index = "https://download.pytorch.org/whl/cpu" }, { name = "transformer-engine", marker = "sys_platform == 'never'" }, { name = "transformer-engine-torch", marker = "sys_platform == 'never'" }, { name = "triton", marker = "sys_platform == 'never'" }, @@ -2279,6 +2279,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "kernels" +version = "0.12.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/07/d2b635e965b232cae1aa873c6e0458947196be8dca7bb02e64d3cd6e8d19/kernels-0.12.2.tar.gz", hash = "sha256:812fc43c2814f046cee655cbebf3918cddd489715773670bdb38cca3f5203b5b", size = 57108, upload-time = "2026-03-04T10:03:00.379Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/be/f5d6758b48633e4f6a28198fcf4bf9f763cc6a82e2335d9fe8802a5cb440/kernels-0.12.2-py3-none-any.whl", hash = "sha256:1289261804748cf3cf8e3afab80b505b0f1b28e4ec88379cdf08dc31e64964b8", size = 55205, upload-time = "2026-03-04T10:02:59.305Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.9" @@ -3235,6 +3250,7 @@ all = [ { name = "ftfy" }, { name = "imageio" }, { name = "imageio-ffmpeg" }, + { name = "kernels" }, { name = "mamba-ssm" }, { name = "mistral-common", extra = ["opencv"] }, { name = "numba", version = "0.53.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, @@ -3252,6 +3268,7 @@ all = [ { name = "sentencepiece" }, { name = "timm" }, { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "torchvision", marker = "sys_platform == 'never'" }, { name = "transformer-engine", marker = "sys_platform == 'never'" }, ] cuda = [ @@ -3271,7 +3288,9 @@ diffusion = [ { name = "ftfy" }, { name = "imageio" }, { name = "imageio-ffmpeg" }, + { name = "kernels" }, { name = "opencv-python-headless" }, + { name = "torchvision", marker = "sys_platform == 'never'" }, ] extra = [ { name = "flash-linear-attention" }, @@ -3358,6 +3377,7 @@ requires-dist = [ { name = "ftfy", marker = "extra == 'diffusion'" }, { name = "imageio", marker = "extra == 'diffusion'" }, { name = "imageio-ffmpeg", marker = "extra == 'diffusion'" }, + { name = "kernels", marker = "extra == 'diffusion'" }, { name = "mamba-ssm", marker = "extra == 'cuda'" }, { name = "megatron-fsdp", specifier = ">=0.2.3" }, { name = "mistral-common", extras = ["audio", "hf-hub", "image", "sentencepiece"] }, @@ -3390,6 +3410,9 @@ requires-dist = [ { name = "torchao" }, { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, { name = "torchdata" }, + { name = "torchvision", marker = "sys_platform == 'darwin' and extra == 'diffusion'", index = "https://pypi.org/simple" }, + { name = "torchvision", marker = "sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cpu" }, + { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cu129" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'cuda'", specifier = "<=2.11.0" }, { name = "transformers", specifier = ">=5.0.0" }, { name = "wandb" }, @@ -6409,8 +6432,8 @@ wheels = [ [[package]] name = "torchvision" -version = "0.23.0" -source = { registry = "https://pypi.org/simple" } +version = "0.25.0+cpu" +source = { registry = "https://download.pytorch.org/whl/cpu" } dependencies = [ { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" }, diff --git a/examples/diffusion/finetune/flux_t2i_flow.yaml b/examples/diffusion/finetune/flux_t2i_flow.yaml index 52674fe61..46175d806 100644 --- a/examples/diffusion/finetune/flux_t2i_flow.yaml +++ b/examples/diffusion/finetune/flux_t2i_flow.yaml @@ -54,7 +54,7 @@ step_scheduler: data: dataloader: - _target_: nemo_automodel.components.datasets.diffusion.build_flux_multiresolution_dataloader + _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader cache_dir: PATH_TO_YOUR_DATA train_text_encoder: false num_workers: 10 diff --git a/examples/diffusion/finetune/wan2_1_t2v_flow.yaml b/examples/diffusion/finetune/wan2_1_t2v_flow.yaml index 0c47c543a..88a558ed0 100644 --- a/examples/diffusion/finetune/wan2_1_t2v_flow.yaml +++ b/examples/diffusion/finetune/wan2_1_t2v_flow.yaml @@ -22,10 +22,14 @@ step_scheduler: data: dataloader: - _target_: nemo_automodel.components.datasets.diffusion.build_dataloader - meta_folder: PATH_TO_YOUR_DATA + _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader + cache_dir: PATH_TO_YOUR_DATA + model_type: wan + base_resolution: [512, 512] + dynamic_batch_size: false + shuffle: true + drop_last: false num_workers: 2 - device: cpu optim: learning_rate: 5e-6 diff --git a/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml b/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml index 72b26cf03..715c3b711 100644 --- a/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml +++ b/examples/diffusion/finetune/wan2_1_t2v_flow_multinode.yaml @@ -22,10 +22,14 @@ step_scheduler: data: dataloader: - _target_: nemo_automodel.components.datasets.diffusion.build_dataloader - meta_folder: PATH_TO_YOUR_DATA + _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader + cache_dir: PATH_TO_YOUR_DATA + model_type: wan + base_resolution: [512, 512] + dynamic_batch_size: false + shuffle: true + drop_last: false num_workers: 2 - device: cpu optim: diff --git a/examples/diffusion/generate/flux_generate.py b/examples/diffusion/generate/flux_generate.py index 623b1bb4b..195eed9fd 100644 --- a/examples/diffusion/generate/flux_generate.py +++ b/examples/diffusion/generate/flux_generate.py @@ -30,7 +30,7 @@ from diffusers import FluxPipeline # Import the provided dataloader builder -from nemo_automodel.components.datasets.diffusion import build_flux_multiresolution_dataloader +from nemo_automodel.components.datasets.diffusion import build_text_to_image_multiresolution_dataloader def parse_args(): @@ -187,7 +187,7 @@ def main(): print("=" * 80) print(f"Initializing Multiresolution Dataloader: {args.data_path}") - dataloader, _ = build_flux_multiresolution_dataloader( + dataloader, _ = build_text_to_image_multiresolution_dataloader( cache_dir=args.data_path, batch_size=1, num_workers=args.num_workers, dynamic_batch_size=True, shuffle=False ) print(f"[INFO] Dataloader ready. Batches: {len(dataloader)}") diff --git a/examples/diffusion/pretrain/flux_t2i_flow.yaml b/examples/diffusion/pretrain/flux_t2i_flow.yaml index 418b543bc..d85805eb7 100644 --- a/examples/diffusion/pretrain/flux_t2i_flow.yaml +++ b/examples/diffusion/pretrain/flux_t2i_flow.yaml @@ -58,7 +58,7 @@ step_scheduler: data: dataloader: - _target_: nemo_automodel.components.datasets.diffusion.build_flux_multiresolution_dataloader + _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader cache_dir: PATH_TO_YOUR_DATA train_text_encoder: false num_workers: 1 diff --git a/nemo_automodel/components/datasets/diffusion/__init__.py b/nemo_automodel/components/datasets/diffusion/__init__.py index ee3451498..2e829606c 100644 --- a/nemo_automodel/components/datasets/diffusion/__init__.py +++ b/nemo_automodel/components/datasets/diffusion/__init__.py @@ -17,14 +17,26 @@ import importlib _LAZY_ATTRS = { - "MetaFilesDataset": (".meta_files_dataset", "MetaFilesDataset"), + # Dataset classes + "BaseMultiresolutionDataset": (".base_dataset", "BaseMultiresolutionDataset"), "TextToImageDataset": (".text_to_image_dataset", "TextToImageDataset"), + "TextToVideoDataset": (".text_to_video_dataset", "TextToVideoDataset"), + "MetaFilesDataset": (".meta_files_dataset", "MetaFilesDataset"), + # Utilities "MultiTierBucketCalculator": (".multi_tier_bucketing", "MultiTierBucketCalculator"), "SequentialBucketSampler": (".sampler", "SequentialBucketSampler"), - "collate_fn_flux": (".collate_fns", "collate_fn_flux"), - "build_flux_multiresolution_dataloader": (".collate_fns", "build_flux_multiresolution_dataloader"), - "build_mock_dataloader": (".mock_dataloader", "build_mock_dataloader"), + "VIDEO_OPTIONAL_FIELDS": (".text_to_video_dataset", "VIDEO_OPTIONAL_FIELDS"), + # Collate functions + "collate_fn_text_to_image": (".collate_fns", "collate_fn_text_to_image"), + "collate_fn_video": (".collate_fns", "collate_fn_video"), + "collate_fn_production": (".collate_fns", "collate_fn_production"), + # Dataloader builders + "build_text_to_image_multiresolution_dataloader": (".collate_fns", "build_text_to_image_multiresolution_dataloader"), + "build_video_multiresolution_dataloader": (".collate_fns", "build_video_multiresolution_dataloader"), + # Legacy (non-multiresolution) "build_dataloader": (".meta_files_dataset", "build_dataloader"), + # Mock/test + "build_mock_dataloader": (".mock_dataloader", "build_mock_dataloader"), } __all__ = sorted(_LAZY_ATTRS.keys()) diff --git a/nemo_automodel/components/datasets/diffusion/base_dataset.py b/nemo_automodel/components/datasets/diffusion/base_dataset.py new file mode 100644 index 000000000..f8e74eb85 --- /dev/null +++ b/nemo_automodel/components/datasets/diffusion/base_dataset.py @@ -0,0 +1,133 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, List + +from torch.utils.data import Dataset + +from .multi_tier_bucketing import MultiTierBucketCalculator + +logger = logging.getLogger(__name__) + + +class BaseMultiresolutionDataset(Dataset, ABC): + """Abstract base class for multiresolution datasets with bucket-based sampling.""" + + def __init__(self, cache_dir: str, quantization: int = 64): + """ + Args: + cache_dir: Directory containing preprocessed cache (metadata.json + shards) + quantization: Resolution quantization factor (64 for images, 8 for video) + """ + self.cache_dir = Path(cache_dir) + + # Load metadata + self.metadata = self._load_metadata() + + logger.info(f"Loaded dataset with {len(self.metadata)} samples") + + # Group by bucket + self._group_by_bucket() + + # Initialize bucket calculator for dynamic batch sizes + self.calculator = MultiTierBucketCalculator(quantization=quantization) + + def _load_metadata(self) -> List[Dict]: + """Load metadata from cache directory. + + Expects metadata.json with "shards" key referencing shard files. + """ + metadata_file = self.cache_dir / "metadata.json" + + if not metadata_file.exists(): + raise FileNotFoundError(f"No metadata.json found in {self.cache_dir}") + + with open(metadata_file, "r") as f: + data = json.load(f) + + if not isinstance(data, dict) or "shards" not in data: + raise ValueError(f"Invalid metadata format in {metadata_file}. Expected dict with 'shards' key.") + + # Load all shard files + metadata = [] + for shard_name in data["shards"]: + shard_path = self.cache_dir / shard_name + with open(shard_path, "r") as f: + shard_data = json.load(f) + metadata.extend(shard_data) + + return metadata + + def _aspect_ratio_to_name(self, aspect_ratio: float) -> str: + """Convert aspect ratio to a descriptive name.""" + if aspect_ratio < 0.85: + return "tall" + elif aspect_ratio > 1.18: + return "wide" + else: + return "square" + + def _group_by_bucket(self): + """Group samples by bucket (aspect_ratio + resolution).""" + self.bucket_groups = {} + + # Support both bucket_resolution (video) and crop_resolution (image) keys + resolution_key = "bucket_resolution" if "bucket_resolution" in self.metadata[0] else "crop_resolution" + + for idx, item in enumerate(self.metadata): + aspect_ratio = item.get("aspect_ratio", 1.0) + aspect_name = self._aspect_ratio_to_name(aspect_ratio) + resolution = tuple(item[resolution_key]) + bucket_key = (aspect_name, resolution) + + if bucket_key not in self.bucket_groups: + self.bucket_groups[bucket_key] = { + "indices": [], + "aspect_name": aspect_name, + "aspect_ratio": aspect_ratio, + "resolution": resolution, + "pixels": resolution[0] * resolution[1], + } + + self.bucket_groups[bucket_key]["indices"].append(idx) + + # Sort buckets by resolution (low to high for optimal memory usage) + self.sorted_bucket_keys = sorted(self.bucket_groups.keys(), key=lambda k: self.bucket_groups[k]["pixels"]) + + logger.info(f"\nDataset organized into {len(self.bucket_groups)} buckets:") + for key in self.sorted_bucket_keys: + bucket = self.bucket_groups[key] + aspect_name, resolution = key + logger.info( + f" {aspect_name:6s} {resolution[0]:4d}x{resolution[1]:4d}: {len(bucket['indices']):5d} samples" + ) + + def get_bucket_info(self) -> Dict: + """Get bucket organization information.""" + return { + "total_buckets": len(self.bucket_groups), + "buckets": {f"{k[0]}/{k[1][0]}x{k[1][1]}": len(v["indices"]) for k, v in self.bucket_groups.items()}, + } + + def __len__(self) -> int: + return len(self.metadata) + + @abstractmethod + def __getitem__(self, idx: int) -> Dict: + """Load a single sample. Subclasses must implement.""" + ... diff --git a/nemo_automodel/components/datasets/diffusion/collate_fns.py b/nemo_automodel/components/datasets/diffusion/collate_fns.py index 3f9567661..426f7bd56 100644 --- a/nemo_automodel/components/datasets/diffusion/collate_fns.py +++ b/nemo_automodel/components/datasets/diffusion/collate_fns.py @@ -13,27 +13,70 @@ # limitations under the License. """ -Flux-compatible collate function that wraps the multiresolution dataloader output -to match the FlowMatchingPipeline expected batch format. +Collate functions and dataloader builders for multiresolution diffusion training. + +Supports both image and video pipelines via the FlowMatchingPipeline +expected batch format. """ +import functools import logging -from typing import Dict, List, Tuple +from typing import Callable, Dict, List, Tuple +import torch from torch.utils.data import DataLoader -from .sampler import ( - SequentialBucketSampler, - collate_fn_production, -) +from .sampler import SequentialBucketSampler from .text_to_image_dataset import TextToImageDataset +from .text_to_video_dataset import TextToVideoDataset, collate_optional_video_fields logger = logging.getLogger(__name__) -def collate_fn_flux(batch: List[Dict]) -> Dict: +def collate_fn_production(batch: List[Dict]) -> Dict: + """Production collate function with verification.""" + # Verify all samples have same resolution + resolutions = [tuple(item["crop_resolution"].tolist()) for item in batch] + assert len(set(resolutions)) == 1, f"Mixed resolutions in batch: {set(resolutions)}" + + # Stack tensors + latents = torch.stack([item["latent"] for item in batch]) + crop_resolutions = torch.stack([item["crop_resolution"] for item in batch]) + original_resolutions = torch.stack([item["original_resolution"] for item in batch]) + crop_offsets = torch.stack([item["crop_offset"] for item in batch]) + + # Collect metadata + prompts = [item["prompt"] for item in batch] + image_paths = [item["image_path"] for item in batch] + bucket_ids = [item["bucket_id"] for item in batch] + aspect_ratios = [item["aspect_ratio"] for item in batch] + + output = { + "latent": latents, + "crop_resolution": crop_resolutions, + "original_resolution": original_resolutions, + "crop_offset": crop_offsets, + "prompt": prompts, + "image_path": image_paths, + "bucket_id": bucket_ids, + "aspect_ratio": aspect_ratios, + } + + # Handle text encodings + if "clip_hidden" in batch[0]: + output["clip_hidden"] = torch.stack([item["clip_hidden"] for item in batch]) + output["pooled_prompt_embeds"] = torch.stack([item["pooled_prompt_embeds"] for item in batch]) + output["prompt_embeds"] = torch.stack([item["prompt_embeds"] for item in batch]) + else: + output["clip_tokens"] = torch.stack([item["clip_tokens"] for item in batch]) + output["t5_tokens"] = torch.stack([item["t5_tokens"] for item in batch]) + + return output + + +def collate_fn_text_to_image(batch: List[Dict]) -> Dict: """ - Flux-compatible collate function that transforms multiresolution batch output + Text-to-image collate function that transforms multiresolution batch output to match FlowMatchingPipeline expected format. Args: @@ -45,11 +88,11 @@ def collate_fn_flux(batch: List[Dict]) -> Dict: # First, use the production collate to stack tensors production_batch = collate_fn_production(batch) - # Keep latent as 4D [B, C, H, W] for Flux (image model, not video) + # Keep latent as 4D [B, C, H, W] for image (not video) latent = production_batch["latent"] - # Use "image_latents" key for 4D tensors (FluxAdapter expects 4D) - flux_batch = { + # Use "image_latents" key for 4D tensors + image_batch = { "image_latents": latent, "data_type": "image", "metadata": { @@ -66,23 +109,64 @@ def collate_fn_flux(batch: List[Dict]) -> Dict: # Handle text embeddings (pre-encoded vs tokenized) if "prompt_embeds" in production_batch: # Pre-encoded text embeddings - flux_batch["text_embeddings"] = production_batch["prompt_embeds"] - flux_batch["pooled_prompt_embeds"] = production_batch["pooled_prompt_embeds"] + image_batch["text_embeddings"] = production_batch["prompt_embeds"] + image_batch["pooled_prompt_embeds"] = production_batch["pooled_prompt_embeds"] # Also include CLIP hidden for models that need it if "clip_hidden" in production_batch: - flux_batch["clip_hidden"] = production_batch["clip_hidden"] + image_batch["clip_hidden"] = production_batch["clip_hidden"] else: # Tokenized - need to encode during training (not supported yet) - flux_batch["t5_tokens"] = production_batch["t5_tokens"] - flux_batch["clip_tokens"] = production_batch["clip_tokens"] + image_batch["t5_tokens"] = production_batch["t5_tokens"] + image_batch["clip_tokens"] = production_batch["clip_tokens"] raise NotImplementedError( "On-the-fly text encoding not yet supported. Please use pre-encoded text embeddings in your dataset." ) - return flux_batch + return image_batch -def build_flux_multiresolution_dataloader( +def _build_multiresolution_dataloader_core( + *, + dataset, + collate_fn: Callable, + batch_size: int, + dp_rank: int, + dp_world_size: int, + base_resolution: Tuple[int, int] = (512, 512), + drop_last: bool = True, + shuffle: bool = True, + dynamic_batch_size: bool = False, + num_workers: int = 4, + pin_memory: bool = True, + prefetch_factor: int = 2, +) -> Tuple[DataLoader, SequentialBucketSampler]: + """Internal helper: create sampler + DataLoader from dataset and collate fn.""" + sampler = SequentialBucketSampler( + dataset, + base_batch_size=batch_size, + base_resolution=base_resolution, + drop_last=drop_last, + shuffle_buckets=shuffle, + shuffle_within_bucket=shuffle, + dynamic_batch_size=dynamic_batch_size, + num_replicas=dp_world_size, + rank=dp_rank, + ) + + dataloader = DataLoader( + dataset, + batch_sampler=sampler, + collate_fn=collate_fn, + num_workers=num_workers, + pin_memory=pin_memory, + prefetch_factor=prefetch_factor if num_workers > 0 else None, + persistent_workers=num_workers > 0, + ) + + return dataloader, sampler + + +def build_text_to_image_multiresolution_dataloader( *, # TextToImageDataset parameters cache_dir: str, @@ -100,10 +184,10 @@ def build_flux_multiresolution_dataloader( prefetch_factor: int = 2, ) -> Tuple[DataLoader, SequentialBucketSampler]: """ - Build a Flux-compatible multiresolution dataloader for TrainDiffusionRecipe. + Build a text-to-image multiresolution dataloader for TrainDiffusionRecipe. This wraps the existing TextToImageDataset and SequentialBucketSampler - with a Flux-compatible collate function. + with a text-to-image collate function. Args: cache_dir: Directory containing preprocessed cache (metadata.json, shards, and resolution subdirs) @@ -122,40 +206,138 @@ def build_flux_multiresolution_dataloader( Returns: Tuple of (DataLoader, SequentialBucketSampler) """ - logger.info("Building Flux multiresolution dataloader:") + logger.info("Building text-to-image multiresolution dataloader:") logger.info(f" cache_dir: {cache_dir}") logger.info(f" train_text_encoder: {train_text_encoder}") logger.info(f" batch_size: {batch_size}") logger.info(f" dp_rank: {dp_rank}, dp_world_size: {dp_world_size}") - # Create dataset dataset = TextToImageDataset( cache_dir=cache_dir, train_text_encoder=train_text_encoder, ) - # Create sampler - sampler = SequentialBucketSampler( - dataset, - base_batch_size=batch_size, + dataloader, sampler = _build_multiresolution_dataloader_core( + dataset=dataset, + collate_fn=collate_fn_text_to_image, + batch_size=batch_size, + dp_rank=dp_rank, + dp_world_size=dp_world_size, base_resolution=base_resolution, drop_last=drop_last, - shuffle_buckets=shuffle, - shuffle_within_bucket=shuffle, + shuffle=shuffle, dynamic_batch_size=dynamic_batch_size, - num_replicas=dp_world_size, - rank=dp_rank, + num_workers=num_workers, + pin_memory=pin_memory, + prefetch_factor=prefetch_factor, ) - # Create dataloader with Flux-compatible collate - dataloader = DataLoader( - dataset, - batch_sampler=sampler, - collate_fn=collate_fn_flux, # Use Flux-compatible collate + logger.info(f" Dataset size: {len(dataset)}") + logger.info(f" Batches per epoch: {len(sampler)}") + + return dataloader, sampler + + +def collate_fn_video(batch: List[Dict], model_type: str = "wan") -> Dict: + """ + Video-compatible collate function for multiresolution video training. + + Concatenates video_latents (5D) and text_embeddings (3D) along the batch dim, + matching the format expected by FlowMatchingPipeline with SimpleAdapter. + + Args: + batch: List of samples from TextToVideoDataset + model_type: Model type for model-specific field handling + + Returns: + Dict compatible with FlowMatchingPipeline.step() + """ + # Verify all samples have the same bucket resolution + resolutions = [tuple(item["bucket_resolution"].tolist()) for item in batch] + assert len(set(resolutions)) == 1, f"Mixed resolutions in batch: {set(resolutions)}" + + video_latents = torch.cat([item["video_latents"] for item in batch], dim=0) + text_embeddings = torch.cat([item["text_embeddings"] for item in batch], dim=0) + + result = { + "video_latents": video_latents, + "text_embeddings": text_embeddings, + "data_type": "video", + } + + # Collate model-specific optional fields + collate_optional_video_fields(batch, result) + + return result + + +def build_video_multiresolution_dataloader( + *, + cache_dir: str, + model_type: str = "wan", + device: str = "cpu", + batch_size: int = 1, + dp_rank: int = 0, + dp_world_size: int = 1, + base_resolution: Tuple[int, int] = (512, 512), + drop_last: bool = True, + shuffle: bool = True, + dynamic_batch_size: bool = False, + num_workers: int = 2, + pin_memory: bool = True, + prefetch_factor: int = 2, +) -> Tuple[DataLoader, SequentialBucketSampler]: + """ + Build a multiresolution video dataloader for TrainDiffusionRecipe. + + Uses TextToVideoDataset with SequentialBucketSampler for bucket-based + multiresolution video training (e.g. Wan, Hunyuan). + + Args: + cache_dir: Directory containing preprocessed cache (metadata.json + shards + WxH/*.meta) + model_type: Model type ("wan", "hunyuan", etc.) + device: Device to load tensors to + batch_size: Batch size per GPU + dp_rank: Data parallel rank + dp_world_size: Data parallel world size + base_resolution: Base resolution for dynamic batch sizing + drop_last: Drop incomplete batches + shuffle: Shuffle data + dynamic_batch_size: Scale batch size by resolution + num_workers: DataLoader workers + pin_memory: Pin memory for GPU transfer + prefetch_factor: Prefetch batches per worker + + Returns: + Tuple of (DataLoader, SequentialBucketSampler) + """ + logger.info("Building video multiresolution dataloader:") + logger.info(f" cache_dir: {cache_dir}") + logger.info(f" model_type: {model_type}") + logger.info(f" batch_size: {batch_size}") + logger.info(f" dp_rank: {dp_rank}, dp_world_size: {dp_world_size}") + + dataset = TextToVideoDataset( + cache_dir=cache_dir, + model_type=model_type, + device=device, + ) + + collate = functools.partial(collate_fn_video, model_type=model_type) + + dataloader, sampler = _build_multiresolution_dataloader_core( + dataset=dataset, + collate_fn=collate, + batch_size=batch_size, + dp_rank=dp_rank, + dp_world_size=dp_world_size, + base_resolution=base_resolution, + drop_last=drop_last, + shuffle=shuffle, + dynamic_batch_size=dynamic_batch_size, num_workers=num_workers, pin_memory=pin_memory, - prefetch_factor=prefetch_factor if num_workers > 0 else None, - persistent_workers=num_workers > 0, + prefetch_factor=prefetch_factor, ) logger.info(f" Dataset size: {len(dataset)}") diff --git a/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py b/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py index 381b53745..4dbeea2bc 100644 --- a/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py +++ b/nemo_automodel/components/datasets/diffusion/meta_files_dataset.py @@ -24,6 +24,8 @@ import torch.distributed as dist from torch.utils.data import DataLoader, Dataset, DistributedSampler +from .text_to_video_dataset import collate_optional_video_fields, load_optional_video_fields + logger = logging.getLogger(__name__) @@ -111,20 +113,6 @@ def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: # type: ignore[ov text_embeddings: torch.Tensor = data["text_embeddings"].to(self.device) video_latents: torch.Tensor = data["video_latents"].to(self.device) - # Load text_mask if available (backwards compatible) - text_mask = data.get("text_mask") - text_embeddings_2 = data.get("text_embeddings_2") - text_mask_2 = data.get("text_mask_2") - image_embeds = data.get("image_embeds") - if text_mask is not None: - text_mask = text_mask.to(self.device) - if text_embeddings_2 is not None: - text_embeddings_2 = text_embeddings_2.to(self.device) - if text_mask_2 is not None: - text_mask_2 = text_mask_2.to(self.device) - if image_embeds is not None: - image_embeds = image_embeds.to(self.device) - if self.transform_text is not None: text_embeddings = self.transform_text(text_embeddings) if self.transform_video is not None: @@ -146,15 +134,8 @@ def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: # type: ignore[ov "file_info": file_info, } - # Add text_mask if available (backwards compatible) - if text_mask is not None: - result["text_mask"] = text_mask - if text_embeddings_2 is not None: - result["text_embeddings_2"] = text_embeddings_2 - if text_mask_2 is not None: - result["text_mask_2"] = text_mask_2 - if image_embeds is not None: - result["image_embeds"] = image_embeds + # Optional model-specific fields (backwards compatible) + result.update(load_optional_video_fields(data, self.device)) return result @@ -174,19 +155,8 @@ def collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: "file_info": [item["file_info"] for item in batch], } - # Collate text_mask if available (backwards compatible) - if len(batch) > 0 and "text_mask" in batch[0]: - text_mask = torch.cat([item["text_mask"] for item in batch], dim=0) - result["text_mask"] = text_mask - if len(batch) > 0 and "text_embeddings_2" in batch[0]: - text_embeddings_2 = torch.cat([item["text_embeddings_2"] for item in batch], dim=0) - result["text_embeddings_2"] = text_embeddings_2 - if len(batch) > 0 and "text_mask_2" in batch[0]: - text_mask_2 = torch.cat([item["text_mask_2"] for item in batch], dim=0) - result["text_mask_2"] = text_mask_2 - if len(batch) > 0 and "image_embeds" in batch[0]: - image_embeds = torch.cat([item["image_embeds"] for item in batch], dim=0) - result["image_embeds"] = image_embeds + # Optional model-specific fields (backwards compatible) + collate_optional_video_fields(batch, result) return result diff --git a/nemo_automodel/components/datasets/diffusion/sampler.py b/nemo_automodel/components/datasets/diffusion/sampler.py index 3ee806631..92ac23a9c 100644 --- a/nemo_automodel/components/datasets/diffusion/sampler.py +++ b/nemo_automodel/components/datasets/diffusion/sampler.py @@ -18,9 +18,9 @@ import torch import torch.distributed as dist -from torch.utils.data import DataLoader, Sampler +from torch.utils.data import Sampler -from .text_to_image_dataset import TextToImageDataset +from .base_dataset import BaseMultiresolutionDataset logger = logging.getLogger(__name__) @@ -41,7 +41,7 @@ class SequentialBucketSampler(Sampler[List[int]]): def __init__( self, - dataset: TextToImageDataset, + dataset: BaseMultiresolutionDataset, base_batch_size: int = 32, base_resolution: Tuple[int, int] = (512, 512), drop_last: bool = True, @@ -54,7 +54,7 @@ def __init__( ): """ Args: - dataset: TextToImageDataset + dataset: BaseMultiresolutionDataset (or any subclass) base_batch_size: Batch size (fixed if dynamic_batch_size=False, or base for scaling if dynamic_batch_size=True) base_resolution: Reference resolution for batch size scaling @@ -222,103 +222,3 @@ def get_batch_info(self, batch_idx: int) -> Dict: running_count += num_batches return {} - - -def collate_fn_production(batch: List[Dict]) -> Dict: - """Production collate function with verification.""" - # Verify all samples have same resolution - resolutions = [tuple(item["crop_resolution"].tolist()) for item in batch] - assert len(set(resolutions)) == 1, f"Mixed resolutions in batch: {set(resolutions)}" - - # Stack tensors - latents = torch.stack([item["latent"] for item in batch]) - crop_resolutions = torch.stack([item["crop_resolution"] for item in batch]) - original_resolutions = torch.stack([item["original_resolution"] for item in batch]) - crop_offsets = torch.stack([item["crop_offset"] for item in batch]) - - # Collect metadata - prompts = [item["prompt"] for item in batch] - image_paths = [item["image_path"] for item in batch] - bucket_ids = [item["bucket_id"] for item in batch] - aspect_ratios = [item["aspect_ratio"] for item in batch] - - output = { - "latent": latents, - "crop_resolution": crop_resolutions, - "original_resolution": original_resolutions, - "crop_offset": crop_offsets, - "prompt": prompts, - "image_path": image_paths, - "bucket_id": bucket_ids, - "aspect_ratio": aspect_ratios, - } - - # Handle text encodings - if "clip_hidden" in batch[0]: - output["clip_hidden"] = torch.stack([item["clip_hidden"] for item in batch]) - output["pooled_prompt_embeds"] = torch.stack([item["pooled_prompt_embeds"] for item in batch]) - output["prompt_embeds"] = torch.stack([item["prompt_embeds"] for item in batch]) - else: - output["clip_tokens"] = torch.stack([item["clip_tokens"] for item in batch]) - output["t5_tokens"] = torch.stack([item["t5_tokens"] for item in batch]) - - return output - - -def build_multiresolution_dataloader( - *, - dataset: TextToImageDataset, - base_batch_size: int, - dp_rank: int, - dp_world_size: int, - base_resolution: Tuple[int, int] = (512, 512), - drop_last: bool = True, - shuffle: bool = True, - dynamic_batch_size: bool = False, - num_workers: int = 4, - pin_memory: bool = True, - prefetch_factor: int = 2, -) -> Tuple[DataLoader, SequentialBucketSampler]: - """ - Build production dataloader with sequential bucket iteration and distributed training support. - - Args: - dataset: TextToImageDataset instance - base_batch_size: Batch size (fixed, or base for scaling if dynamic_batch_size=True) - dp_rank: Rank of current process in data parallel group - dp_world_size: Total number of processes in data parallel group - base_resolution: Reference resolution (only used if dynamic_batch_size=True) - drop_last: Drop incomplete batches - shuffle: Shuffle bucket order and samples within buckets each epoch - dynamic_batch_size: If True, scale batch size based on resolution. - If False (default), use base_batch_size for all buckets. - num_workers: Number of data loading workers - pin_memory: Pin memory for faster GPU transfer - prefetch_factor: How many batches to prefetch per worker - - Returns: - Tuple of (DataLoader, SequentialBucketSampler) for production training - """ - sampler = SequentialBucketSampler( - dataset, - base_batch_size=base_batch_size, - base_resolution=base_resolution, - drop_last=drop_last, - shuffle_buckets=shuffle, - shuffle_within_bucket=shuffle, - dynamic_batch_size=dynamic_batch_size, - num_replicas=dp_world_size, - rank=dp_rank, - ) - - dataloader = DataLoader( - dataset, - batch_sampler=sampler, - collate_fn=collate_fn_production, - num_workers=num_workers, - pin_memory=pin_memory, - prefetch_factor=prefetch_factor if num_workers > 0 else None, - persistent_workers=num_workers > 0, # Keep workers alive between epochs - ) - - return dataloader, sampler diff --git a/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py b/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py index 062e0aeec..41e356220 100644 --- a/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py +++ b/nemo_automodel/components/datasets/diffusion/text_to_image_dataset.py @@ -12,20 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json -import logging from pathlib import Path -from typing import Dict, List +from typing import Dict import torch -from torch.utils.data import Dataset -from .multi_tier_bucketing import MultiTierBucketCalculator +from .base_dataset import BaseMultiresolutionDataset -logger = logging.getLogger(__name__) - -class TextToImageDataset(Dataset): +class TextToImageDataset(BaseMultiresolutionDataset): """Text-to-Image dataset with hierarchical bucket organization.""" def __init__( @@ -38,97 +33,8 @@ def __init__( cache_dir: Directory containing preprocessed cache train_text_encoder: If True, returns tokens instead of embeddings """ - self.cache_dir = Path(cache_dir) self.train_text_encoder = train_text_encoder - - # Load metadata - self.metadata = self._load_metadata() - - logger.info(f"Loaded dataset with {len(self.metadata)} samples") - - # Group by bucket - self._group_by_bucket() - - # Initialize bucket calculator for dynamic batch sizes - self.calculator = MultiTierBucketCalculator(quantization=64) - - def _load_metadata(self) -> List[Dict]: - """Load metadata from cache directory. - - Expects metadata.json with "shards" key referencing shard files. - """ - metadata_file = self.cache_dir / "metadata.json" - - if not metadata_file.exists(): - raise FileNotFoundError(f"No metadata.json found in {self.cache_dir}") - - with open(metadata_file, "r") as f: - data = json.load(f) - - if not isinstance(data, dict) or "shards" not in data: - raise ValueError(f"Invalid metadata format in {metadata_file}. Expected dict with 'shards' key.") - - # Load all shard files - metadata = [] - for shard_name in data["shards"]: - shard_path = self.cache_dir / shard_name - with open(shard_path, "r") as f: - shard_data = json.load(f) - metadata.extend(shard_data) - - return metadata - - def _aspect_ratio_to_name(self, aspect_ratio: float) -> str: - """Convert aspect ratio to a descriptive name.""" - if aspect_ratio < 0.85: - return "tall" - elif aspect_ratio > 1.18: - return "wide" - else: - return "square" - - def _group_by_bucket(self): - """Group samples by bucket (aspect_ratio + resolution).""" - self.bucket_groups = {} - - for idx, item in enumerate(self.metadata): - # Bucket key: aspect_name/resolution - aspect_ratio = item.get("aspect_ratio", 1.0) - aspect_name = self._aspect_ratio_to_name(aspect_ratio) - resolution = tuple(item["crop_resolution"]) - bucket_key = (aspect_name, resolution) - - if bucket_key not in self.bucket_groups: - self.bucket_groups[bucket_key] = { - "indices": [], - "aspect_name": aspect_name, - "aspect_ratio": aspect_ratio, - "resolution": resolution, - "pixels": resolution[0] * resolution[1], - } - - self.bucket_groups[bucket_key]["indices"].append(idx) - - # Sort buckets by resolution (low to high for optimal memory usage) - self.sorted_bucket_keys = sorted(self.bucket_groups.keys(), key=lambda k: self.bucket_groups[k]["pixels"]) - - logger.info(f"\nDataset organized into {len(self.bucket_groups)} buckets:") - for key in self.sorted_bucket_keys: - bucket = self.bucket_groups[key] - aspect_name, resolution = key - logger.info( - f" {aspect_name:6s} {resolution[0]:4d}x{resolution[1]:4d}: {len(bucket['indices']):5d} samples" - ) - - def get_bucket_info(self) -> Dict: - """Get bucket organization information.""" - return { - "total_buckets": len(self.bucket_groups), - "buckets": {f"{k[0]}/{k[1][0]}x{k[1][1]}": len(v["indices"]) for k, v in self.bucket_groups.items()}, - } - - def __len__(self) -> int: - return len(self.metadata) + super().__init__(cache_dir, quantization=64) def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: """Load a single sample.""" diff --git a/nemo_automodel/components/datasets/diffusion/text_to_video_dataset.py b/nemo_automodel/components/datasets/diffusion/text_to_video_dataset.py new file mode 100644 index 000000000..4082d6fb8 --- /dev/null +++ b/nemo_automodel/components/datasets/diffusion/text_to_video_dataset.py @@ -0,0 +1,83 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle +from pathlib import Path +from typing import Dict, List + +import torch + +from .base_dataset import BaseMultiresolutionDataset + +VIDEO_OPTIONAL_FIELDS = ("text_mask", "text_embeddings_2", "text_mask_2", "image_embeds") + + +def load_optional_video_fields(data: dict, device: str = "cpu") -> dict: + """Extract optional model-specific fields, moving to device.""" + result = {} + for key in VIDEO_OPTIONAL_FIELDS: + if key in data and data[key] is not None: + result[key] = data[key].to(device) + return result + + +def collate_optional_video_fields(batch: List[Dict], result: dict) -> None: + """Concatenate optional video fields present in batch into result dict.""" + if not batch: + return + for key in VIDEO_OPTIONAL_FIELDS: + if key in batch[0]: + result[key] = torch.cat([item[key] for item in batch], dim=0) + + +class TextToVideoDataset(BaseMultiresolutionDataset): + """Text-to-Video dataset with multiresolution bucket organization. + + Loads preprocessed .meta files organized by resolution bucket. + Compatible with SequentialBucketSampler for multiresolution training. + """ + + def __init__(self, cache_dir: str, model_type: str = "wan", device: str = "cpu"): + """ + Args: + cache_dir: Directory containing preprocessed cache (metadata.json + shards + WxH/*.meta) + model_type: Model type for model-specific fields ("wan", "hunyuan", etc.) + device: Device to load tensors to + """ + self.model_type = model_type + self.device = device + super().__init__(cache_dir, quantization=8) + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """Load a single video sample from its .meta file.""" + item = self.metadata[idx] + cache_file = Path(item["cache_file"]) + + with open(cache_file, "rb") as f: + data = pickle.load(f) + + video_latents = data["video_latents"].to(self.device) + text_embeddings = data["text_embeddings"].to(self.device) + + output = { + "video_latents": video_latents, + "text_embeddings": text_embeddings, + "bucket_resolution": torch.tensor(item["bucket_resolution"]), + "aspect_ratio": item.get("aspect_ratio", 1.0), + } + + # Model-specific optional fields + output.update(load_optional_video_fields(data, self.device)) + + return output diff --git a/nemo_automodel/recipes/base_recipe.py b/nemo_automodel/recipes/base_recipe.py index 46f305290..b66ca0386 100644 --- a/nemo_automodel/recipes/base_recipe.py +++ b/nemo_automodel/recipes/base_recipe.py @@ -351,9 +351,24 @@ def to_item(x): # Unwrap DDP if present if isinstance(unwrapped_model, DistributedDataParallel): unwrapped_model = unwrapped_model.module - unwrapped_model.save_pretrained( - save_directory=path, checkpointer=self.checkpointer, tokenizer=tokenizer, peft_config=self.peft_config - ) + # Models with HFCheckpointingMixin route save_pretrained through checkpointer.save_model (DCP). + # Models without it (e.g. diffusers) would use their native save_pretrained which fails on + # FSDP2-sharded DTensors, so fall back to checkpointer.save_model directly. + if hasattr(unwrapped_model, 'save_pretrained') and hasattr(unwrapped_model.save_pretrained, '__func__'): + from nemo_automodel.components.models.common.hf_checkpointing_mixin import HFCheckpointingMixin + + if isinstance(unwrapped_model, HFCheckpointingMixin): + unwrapped_model.save_pretrained( + save_directory=path, checkpointer=self.checkpointer, tokenizer=tokenizer, peft_config=self.peft_config + ) + else: + self.checkpointer.save_model( + model=unwrapped_model, weights_path=path, peft_config=self.peft_config, tokenizer=tokenizer + ) + else: + self.checkpointer.save_model( + model=unwrapped_model, weights_path=path, peft_config=self.peft_config, tokenizer=tokenizer + ) # Sync before checkpointing for Dion optimizers = optimizer if isinstance(optimizer, list) else [optimizer] diff --git a/nemo_automodel/recipes/diffusion/train.py b/nemo_automodel/recipes/diffusion/train.py index 64f5733f4..5e2d7d913 100644 --- a/nemo_automodel/recipes/diffusion/train.py +++ b/nemo_automodel/recipes/diffusion/train.py @@ -620,7 +620,7 @@ def run_train_validation_loop(self): ) if self.step_scheduler.is_ckpt_step: - self.save_checkpoint(epoch, global_step) + self.save_checkpoint(epoch, global_step, epoch_loss / num_steps) avg_loss = epoch_loss / num_steps logging.info(f"[INFO] Epoch {epoch + 1} complete. avg_loss={avg_loss:.6f}") diff --git a/pyproject.toml b/pyproject.toml index b3c32c247..49c071594 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,9 @@ diffusion = [ "ftfy", "imageio", "imageio-ffmpeg", + "kernels", "opencv-python-headless", + "torchvision", ] # nvidia-cudnn-cu12 pin: This is required for GPTOSS TE support + faster cudnn attention (only on Linux where CUDA is available) # "nvidia-cudnn-cu12>=9.18.0.0; sys_platform == 'linux'", @@ -205,6 +207,11 @@ torch = [ { index = "pytorch-cu129", marker = "sys_platform == 'linux'" }, { index = "pypi", marker = "sys_platform == 'darwin'" }, ] +torchvision = [ + { index = "pytorch-cpu", marker = "sys_platform != 'linux' and sys_platform != 'darwin'" }, + { index = "pytorch-cu129", marker = "sys_platform == 'linux'" }, + { index = "pypi", marker = "sys_platform == 'darwin'" }, +] [[tool.uv.index]] name = "pypi" diff --git a/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py b/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py index fdc155fec..d657334f0 100644 --- a/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py +++ b/tests/unit_tests/datasets/diffusion/test_collate_diffusion.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Unit tests for collate_fns.py: collate_fn_flux, build_flux_multiresolution_dataloader.""" +"""Unit tests for collate_fns.py: collate_fn_text_to_image and build_text_to_image_multiresolution_dataloader.""" import json import tempfile @@ -23,8 +23,8 @@ import torch from nemo_automodel.components.datasets.diffusion.collate_fns import ( - build_flux_multiresolution_dataloader, - collate_fn_flux, + build_text_to_image_multiresolution_dataloader, + collate_fn_text_to_image, ) @@ -58,12 +58,12 @@ def _make_production_batch( # ============================================================================= -# TestCollateFnFlux +# TestCollateFnTextToImage # ============================================================================= -class TestCollateFnFlux: - """Tests for collate_fn_flux.""" +class TestCollateFnTextToImage: + """Tests for collate_fn_text_to_image.""" def test_pre_encoded_embeddings(self): prod_batch = _make_production_batch(has_prompt_embeds=True) @@ -71,7 +71,7 @@ def test_pre_encoded_embeddings(self): "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production", return_value=prod_batch, ): - result = collate_fn_flux([{}, {}]) # Dummy batch items + result = collate_fn_text_to_image([{}, {}]) # Dummy batch items assert "image_latents" in result assert "text_embeddings" in result @@ -85,7 +85,7 @@ def test_with_clip_hidden(self): "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production", return_value=prod_batch, ): - result = collate_fn_flux([{}, {}]) + result = collate_fn_text_to_image([{}, {}]) assert "clip_hidden" in result @@ -95,7 +95,7 @@ def test_without_clip_hidden(self): "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production", return_value=prod_batch, ): - result = collate_fn_flux([{}, {}]) + result = collate_fn_text_to_image([{}, {}]) assert "clip_hidden" not in result @@ -108,7 +108,7 @@ def test_tokenized_input_raises(self): return_value=prod_batch, ): with pytest.raises(NotImplementedError, match="On-the-fly text encoding"): - collate_fn_flux([{}, {}]) + collate_fn_text_to_image([{}, {}]) def test_metadata_fields(self): prod_batch = _make_production_batch(has_prompt_embeds=True) @@ -116,7 +116,7 @@ def test_metadata_fields(self): "nemo_automodel.components.datasets.diffusion.collate_fns.collate_fn_production", return_value=prod_batch, ): - result = collate_fn_flux([{}, {}]) + result = collate_fn_text_to_image([{}, {}]) meta = result["metadata"] assert "prompts" in meta @@ -129,7 +129,7 @@ def test_metadata_fields(self): # ============================================================================= -# TestBuildFluxMultiresolutionDataloader +# TestBuildTextToImageMultiresolutionDataloader # ============================================================================= @@ -177,8 +177,8 @@ def build_cache(self, resolution=(512, 512)): return metadata -class TestBuildFluxMultiresolutionDataloader: - """Tests for build_flux_multiresolution_dataloader.""" +class TestBuildTextToImageMultiresolutionDataloader: + """Tests for build_text_to_image_multiresolution_dataloader.""" def test_returns_dataloader_and_sampler(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -186,7 +186,7 @@ def test_returns_dataloader_and_sampler(self): builder = MockCacheBuilder(cache_dir, num_samples=10) builder.build_cache() - dl, sampler = build_flux_multiresolution_dataloader( + dl, sampler = build_text_to_image_multiresolution_dataloader( cache_dir=str(cache_dir), batch_size=2, dp_rank=0, @@ -205,7 +205,7 @@ def test_iteration(self): builder = MockCacheBuilder(cache_dir, num_samples=10) builder.build_cache() - dl, _ = build_flux_multiresolution_dataloader( + dl, _ = build_text_to_image_multiresolution_dataloader( cache_dir=str(cache_dir), batch_size=2, dp_rank=0, diff --git a/tests/unit_tests/datasets/diffusion/test_dataloader.py b/tests/unit_tests/datasets/diffusion/test_dataloader.py index 3c820776c..47421039d 100644 --- a/tests/unit_tests/datasets/diffusion/test_dataloader.py +++ b/tests/unit_tests/datasets/diffusion/test_dataloader.py @@ -17,7 +17,7 @@ This module contains both CPU and GPU tests for: - SequentialBucketSampler - collate_fn_production -- build_multiresolution_dataloader +- _build_multiresolution_dataloader_core GPU tests are skipped when CUDA is not available. """ @@ -30,10 +30,12 @@ import pytest import torch +from nemo_automodel.components.datasets.diffusion.collate_fns import ( + _build_multiresolution_dataloader_core, + collate_fn_production, +) from nemo_automodel.components.datasets.diffusion.sampler import ( SequentialBucketSampler, - build_multiresolution_dataloader, - collate_fn_production, ) from nemo_automodel.components.datasets.diffusion.text_to_image_dataset import ( TextToImageDataset, @@ -182,7 +184,7 @@ def test_sampler_init_basic(self, simple_dataset): """Test basic sampler initialization.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -196,7 +198,7 @@ def test_sampler_len(self, simple_dataset): """Test sampler __len__ returns correct batch count.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -209,7 +211,7 @@ def test_sampler_iter_yields_batches(self, simple_dataset): """Test sampler iteration yields batches of indices.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -226,7 +228,7 @@ def test_sampler_batch_size_respected(self, simple_dataset): batch_size = 4 sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=batch_size, + batch_size=batch_size, num_replicas=1, rank=0, drop_last=True, @@ -241,7 +243,7 @@ def test_sampler_drop_last_false(self, simple_dataset): """Test sampler with drop_last=False includes all samples.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, drop_last=False, @@ -257,7 +259,7 @@ def test_sampler_set_epoch(self, simple_dataset): """Test set_epoch changes sampler state.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -270,7 +272,7 @@ def test_sampler_deterministic_shuffling(self, simple_dataset): """Test same seed produces same batch order.""" sampler1 = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, seed=42, @@ -278,7 +280,7 @@ def test_sampler_deterministic_shuffling(self, simple_dataset): sampler2 = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, seed=42, @@ -293,7 +295,7 @@ def test_sampler_different_seeds_different_order(self, simple_dataset): """Test different seeds produce different batch orders.""" sampler1 = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, seed=42, @@ -302,7 +304,7 @@ def test_sampler_different_seeds_different_order(self, simple_dataset): sampler2 = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, seed=123, @@ -321,7 +323,7 @@ def test_sampler_no_shuffle(self, simple_dataset): """Test sampler without shuffling.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, shuffle_buckets=False, @@ -335,7 +337,7 @@ def test_sampler_different_order_across_epochs(self, large_dataset): """Test that bucket and element order differs across epochs.""" sampler = SequentialBucketSampler( large_dataset, - base_batch_size=8, + batch_size=8, num_replicas=1, rank=0, seed=42, @@ -380,7 +382,7 @@ def test_sampler_dynamic_batch_size_disabled(self, multi_resolution_dataset): batch_size = 4 sampler = SequentialBucketSampler( multi_resolution_dataset, - base_batch_size=batch_size, + batch_size=batch_size, dynamic_batch_size=False, num_replicas=1, rank=0, @@ -396,7 +398,7 @@ def test_sampler_dynamic_batch_size_enabled(self, multi_resolution_dataset): """Test sampler with dynamic_batch_size=True varies batch size.""" sampler = SequentialBucketSampler( multi_resolution_dataset, - base_batch_size=8, + batch_size=8, base_resolution=(512, 512), dynamic_batch_size=True, num_replicas=1, @@ -417,7 +419,7 @@ def test_sampler_get_batch_info(self, simple_dataset): """Test get_batch_info returns bucket information.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -442,7 +444,7 @@ def test_multi_rank_same_batch_count(self, large_dataset): for rank in range(world_size): sampler = SequentialBucketSampler( large_dataset, - base_batch_size=8, + batch_size=8, num_replicas=world_size, rank=rank, ) @@ -457,7 +459,7 @@ def test_multi_rank_different_samples(self, large_dataset): sampler0 = SequentialBucketSampler( large_dataset, - base_batch_size=8, + batch_size=8, num_replicas=world_size, rank=0, seed=42, @@ -465,7 +467,7 @@ def test_multi_rank_different_samples(self, large_dataset): sampler1 = SequentialBucketSampler( large_dataset, - base_batch_size=8, + batch_size=8, num_replicas=world_size, rank=1, seed=42, @@ -486,7 +488,7 @@ def test_single_rank_equivalent(self, simple_dataset): """Test single rank (world_size=1) processes all data.""" sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -570,18 +572,19 @@ def test_collate_same_resolution_required(self, multi_resolution_dataset): # ============================================================================ -# CPU Tests - build_multiresolution_dataloader +# CPU Tests - _build_multiresolution_dataloader_core # ============================================================================ -class TestBuildMultiresolutionDataloaderCPU: - """CPU tests for build_multiresolution_dataloader.""" +class TestBuildMultiresolutionDataloaderCoreCPU: + """CPU tests for _build_multiresolution_dataloader_core.""" def test_build_dataloader_returns_tuple(self, simple_dataset): """Test function returns dataloader and sampler.""" - dataloader, sampler = build_multiresolution_dataloader( + dataloader, sampler = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -593,9 +596,10 @@ def test_build_dataloader_returns_tuple(self, simple_dataset): def test_dataloader_iteration(self, simple_dataset): """Test dataloader can be iterated.""" - dataloader, sampler = build_multiresolution_dataloader( + dataloader, sampler = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -613,9 +617,10 @@ def test_dataloader_iteration(self, simple_dataset): def test_dataloader_batch_content(self, simple_dataset): """Test dataloader batches have correct content.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -628,9 +633,10 @@ def test_dataloader_batch_content(self, simple_dataset): def test_dataloader_with_shuffle(self, simple_dataset): """Test dataloader with shuffle enabled.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, shuffle=True, @@ -643,9 +649,10 @@ def test_dataloader_with_shuffle(self, simple_dataset): def test_dataloader_without_shuffle(self, simple_dataset): """Test dataloader with shuffle disabled.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, shuffle=False, @@ -658,9 +665,10 @@ def test_dataloader_without_shuffle(self, simple_dataset): def test_dataloader_with_dynamic_batch(self, multi_resolution_dataset): """Test dataloader with dynamic batch sizing.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=multi_resolution_dataset, - base_batch_size=8, + batch_size=8, base_resolution=(512, 512), dp_rank=0, dp_world_size=1, @@ -687,7 +695,7 @@ def test_sampler_with_gpu_tensors(self, simple_dataset): sampler = SequentialBucketSampler( simple_dataset, - base_batch_size=4, + batch_size=4, num_replicas=1, rank=0, ) @@ -747,20 +755,21 @@ def test_collate_then_transfer_to_gpu(self, simple_dataset): # ============================================================================ -# GPU Tests - build_multiresolution_dataloader +# GPU Tests - _build_multiresolution_dataloader_core # ============================================================================ @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA") -class TestBuildMultiresolutionDataloaderGPU: - """GPU tests for build_multiresolution_dataloader.""" +class TestBuildMultiresolutionDataloaderCoreGPU: + """GPU tests for _build_multiresolution_dataloader_core.""" def test_dataloader_with_pin_memory(self, simple_dataset): """Test dataloader with pin_memory for faster GPU transfer.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, pin_memory=True, @@ -775,9 +784,10 @@ def test_dataloader_with_pin_memory(self, simple_dataset): def test_dataloader_batch_to_gpu(self, simple_dataset): """Test full batch transfer to GPU.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -814,9 +824,10 @@ def test_dataloader_gpu_memory_cleanup(self, simple_dataset): torch.cuda.empty_cache() initial_memory = torch.cuda.memory_allocated() - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -846,9 +857,9 @@ def test_dataloader_multi_gpu_simulation(self, large_dataset): # Create dataloaders for each GPU (simulated) dataloaders = [] for rank in range(min(gpu_count, 2)): # Use up to 2 GPUs for test - dl, _ = build_multiresolution_dataloader( + dl, _ = _build_multiresolution_dataloader_core( dataset=large_dataset, - base_batch_size=8, + batch_size=8, dp_rank=rank, dp_world_size=min(gpu_count, 2), num_workers=0, @@ -867,9 +878,10 @@ def test_dataloader_multi_gpu_simulation(self, large_dataset): def test_gpu_operations_on_batch(self, simple_dataset): """Test performing GPU operations on loaded batch.""" - dataloader, _ = build_multiresolution_dataloader( + dataloader, _ = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -902,9 +914,10 @@ class TestDataloaderIntegration: def test_full_epoch_iteration_cpu(self, simple_dataset): """Test iterating through a full epoch on CPU.""" - dataloader, sampler = build_multiresolution_dataloader( + dataloader, sampler = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -919,9 +932,10 @@ def test_full_epoch_iteration_cpu(self, simple_dataset): def test_multiple_epochs_cpu(self, simple_dataset): """Test iterating through multiple epochs.""" - dataloader, sampler = build_multiresolution_dataloader( + dataloader, sampler = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, num_workers=0, @@ -937,9 +951,10 @@ def test_multiple_epochs_cpu(self, simple_dataset): @pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA") def test_full_epoch_iteration_gpu(self, simple_dataset): """Test iterating through a full epoch with GPU transfer.""" - dataloader, sampler = build_multiresolution_dataloader( + dataloader, sampler = _build_multiresolution_dataloader_core( + collate_fn=collate_fn_production, dataset=simple_dataset, - base_batch_size=4, + batch_size=4, dp_rank=0, dp_world_size=1, pin_memory=True, @@ -964,7 +979,7 @@ def test_deterministic_across_ranks(self, large_dataset): # Create samplers for two ranks sampler0 = SequentialBucketSampler( large_dataset, - base_batch_size=8, + batch_size=8, num_replicas=world_size, rank=0, seed=seed, @@ -972,7 +987,7 @@ def test_deterministic_across_ranks(self, large_dataset): sampler1 = SequentialBucketSampler( large_dataset, - base_batch_size=8, + batch_size=8, num_replicas=world_size, rank=1, seed=seed, diff --git a/tools/diffusion/processors/wan.py b/tools/diffusion/processors/wan.py index 8fa2ea004..750e53404 100644 --- a/tools/diffusion/processors/wan.py +++ b/tools/diffusion/processors/wan.py @@ -111,6 +111,8 @@ def load_models(self, model_name: str, device: str) -> Dict[str, Any]: from transformers import AutoTokenizer, UMT5EncoderModel dtype = torch.float16 if "cuda" in device else torch.float32 + # UMT5 requires bfloat16 (float16 causes overflow/zeros in attention and layer norm) + text_encoder_dtype = torch.bfloat16 if "cuda" in device else torch.float32 logger.info("[Wan] Loading models from %s...", model_name) @@ -119,8 +121,19 @@ def load_models(self, model_name: str, device: str) -> Dict[str, Any]: text_encoder = UMT5EncoderModel.from_pretrained( model_name, subfolder="text_encoder", - torch_dtype=dtype, + torch_dtype=text_encoder_dtype, ) + # Workaround for transformers>=5.0.0 weight tying regression: + # The Wan2.1 checkpoint stores the token embedding as "shared.weight", which + # transformers<5 automatically tied to "encoder.embed_tokens.weight". In v5+, + # this tying no longer happens during from_pretrained(), leaving embed_tokens + # zero-initialized and producing all-zero text embeddings. + if ( + hasattr(text_encoder, "shared") + and hasattr(text_encoder.encoder, "embed_tokens") + and text_encoder.encoder.embed_tokens.weight.data_ptr() != text_encoder.shared.weight.data_ptr() + ): + text_encoder.encoder.embed_tokens.weight = text_encoder.shared.weight text_encoder.to(device) text_encoder.eval() diff --git a/uv.lock b/uv.lock index 4c5dfd33c..5a7c5beca 100644 --- a/uv.lock +++ b/uv.lock @@ -2275,6 +2275,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "kernels" +version = "0.12.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/07/d2b635e965b232cae1aa873c6e0458947196be8dca7bb02e64d3cd6e8d19/kernels-0.12.2.tar.gz", hash = "sha256:812fc43c2814f046cee655cbebf3918cddd489715773670bdb38cca3f5203b5b", size = 57108, upload-time = "2026-03-04T10:03:00.379Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/be/f5d6758b48633e4f6a28198fcf4bf9f763cc6a82e2335d9fe8802a5cb440/kernels-0.12.2-py3-none-any.whl", hash = "sha256:1289261804748cf3cf8e3afab80b505b0f1b28e4ec88379cdf08dc31e64964b8", size = 55205, upload-time = "2026-03-04T10:02:59.305Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.9" @@ -3240,6 +3255,7 @@ all = [ { name = "ftfy" }, { name = "imageio" }, { name = "imageio-ffmpeg" }, + { name = "kernels" }, { name = "mamba-ssm" }, { name = "mistral-common", extra = ["opencv"] }, { name = "numba", version = "0.53.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, @@ -3257,6 +3273,10 @@ all = [ { name = "sentencepiece" }, { name = "timm" }, { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "transformer-engine", extra = ["pytorch"] }, ] cuda = [ @@ -3276,7 +3296,12 @@ diffusion = [ { name = "ftfy" }, { name = "imageio" }, { name = "imageio-ffmpeg" }, + { name = "kernels" }, { name = "opencv-python-headless" }, + { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] extra = [ { name = "flash-linear-attention" }, @@ -3365,6 +3390,7 @@ requires-dist = [ { name = "ftfy", marker = "extra == 'diffusion'" }, { name = "imageio", marker = "extra == 'diffusion'" }, { name = "imageio-ffmpeg", marker = "extra == 'diffusion'" }, + { name = "kernels", marker = "extra == 'diffusion'" }, { name = "mamba-ssm", marker = "extra == 'cuda'" }, { name = "megatron-fsdp", specifier = ">=0.2.3" }, { name = "mistral-common", extras = ["audio", "hf-hub", "image", "sentencepiece"] }, @@ -3397,6 +3423,9 @@ requires-dist = [ { name = "torchao" }, { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, { name = "torchdata" }, + { name = "torchvision", marker = "sys_platform == 'darwin' and extra == 'diffusion'", index = "https://pypi.org/simple" }, + { name = "torchvision", marker = "sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cpu" }, + { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'diffusion'", index = "https://download.pytorch.org/whl/cu129" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'cuda'", specifier = "<=2.11.0" }, { name = "transformers", specifier = ">=5.0.0" }, { name = "wandb" }, @@ -4041,8 +4070,10 @@ dependencies = [ { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, - { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, + { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "tqdm" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/46/fb8be250fa7fcfc56fbeb41583645e18d868268f67fbbbeb8ed62a8ff18a/open_clip_torch-3.2.0.tar.gz", hash = "sha256:62b7743012ccc40fb7c64819fa762fba0a13dd74585ac733babe58c2974c2506", size = 1502853, upload-time = "2025-09-21T17:32:08.289Z" } @@ -6460,8 +6491,10 @@ dependencies = [ { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, - { name = "torchvision", version = "0.24.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, - { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, + { name = "torchvision", version = "0.24.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.25.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c5/9d/e4670765d1c033f97096c760b3b907eeb659cf80f3678640e5f060b04c6c/timm-1.0.22.tar.gz", hash = "sha256:14fd74bcc17db3856b1a47d26fb305576c98579ab9d02b36714a5e6b25cde422", size = 2382998, upload-time = "2025-11-05T04:06:09.377Z" } wheels = [ @@ -6722,45 +6755,60 @@ wheels = [ [[package]] name = "torchvision" version = "0.24.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://download.pytorch.org/whl/cu129" } resolution-markers = [ "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl" }, +] + +[[package]] +name = "torchvision" +version = "0.24.0+cu129" +source = { registry = "https://download.pytorch.org/whl/cu129" } +resolution-markers = [ "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'", "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'", "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'", "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'", "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", - "python_full_version < '3.11' and platform_machine == 'aarch64' and sys_platform == 'linux'", "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'", "python_full_version < '3.11' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'", ] dependencies = [ - { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform == 'linux'" }, - { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform == 'linux'" }, - { name = "pillow", marker = "sys_platform == 'linux'" }, - { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/88/e3/1b003ecd52bd721f8304aeb66691edfbc2002747ec83d36188ad6abab506/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a110a51c75e89807a8382b0d8034f5e180fb9319570be3389ffd3d4ac4fd57a9", size = 2418988, upload-time = "2025-10-15T15:51:25.195Z" }, - { url = "https://files.pythonhosted.org/packages/56/2e/3c19a35e62da0f606baf8f6e2ceeab1eb66aaa2f84c6528538b06b416d54/torchvision-0.24.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:81d5b12a6df1bb2cc8bdbad837b637d6ea446f2866e6d94f1b5d478856331be3", size = 8046769, upload-time = "2025-10-15T15:51:15.221Z" }, - { url = "https://files.pythonhosted.org/packages/f8/07/0cd6776eee784742ad3cb2bfd3295383d84cb2f9e87386119333d1587f0f/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbd63bf4ebff84c48c50123eba90526cc9f794fe45bc9f5dd07cec19e8c62bce", size = 2420513, upload-time = "2025-10-15T15:51:18.087Z" }, - { url = "https://files.pythonhosted.org/packages/1a/f4/6026c08011ddcefcbc14161c5aa9dce55c35c6b045e04ef0952e88bf4594/torchvision-0.24.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:78fe414b3bb6dbf7e6f6da6f733ba96881f6b29a9b997228de7c5f603e5ed940", size = 8048018, upload-time = "2025-10-15T15:51:13.579Z" }, - { url = "https://files.pythonhosted.org/packages/00/7b/e3809b3302caea9a12c13f3adebe4fef127188438e719fd6c8dc93db1da6/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b0531d1483fc322d7da0d83be52f0df860a75114ab87dbeeb9de765feaeda843", size = 2419495, upload-time = "2025-10-15T15:51:11.885Z" }, - { url = "https://files.pythonhosted.org/packages/7e/e6/7324ead6793075a8c75c56abeed1236d1750de16a5613cfe2ddad164a92a/torchvision-0.24.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:26b9dd9c083f8e5f7ac827de6d5b88c615d9c582dc87666770fbdf16887e4c25", size = 8050480, upload-time = "2025-10-15T15:51:24.012Z" }, - { url = "https://files.pythonhosted.org/packages/8f/02/e2f6b0ff93ca4db5751ac9c5be43f13d5e53d9e9412324f464dca1775027/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:fec12a269cf80f6b0b71471c8d498cd3bdd9d8e892c425bf39fecb604852c3b0", size = 2371478, upload-time = "2025-10-15T15:51:37.842Z" }, - { url = "https://files.pythonhosted.org/packages/77/85/42e5fc4f716ec7b73cf1f32eeb5c77961be4d4054b26cd6a5ff97f20c966/torchvision-0.24.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7323a9be5e3da695605753f501cdc87824888c5655d27735cdeaa9986b45884c", size = 8050200, upload-time = "2025-10-15T15:51:46.276Z" }, - { url = "https://files.pythonhosted.org/packages/f7/cf/2d7e43409089ce7070f5336161f9216d58653ee1cb26bcb5d6c84cc2de36/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:b1b3db80609c32a088554e8e94b4fc31f1033fe5bb4ac0673ec49c3eb03fb4da", size = 2374466, upload-time = "2025-10-15T15:51:35.382Z" }, - { url = "https://files.pythonhosted.org/packages/e9/30/8f7c328fd7e0a9665da4b6b56b1c627665c18470bfe62f3729ad3eda9aec/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:e6635f100d455c80b43f297df4b8585a76c6a2e114802f6567ddd28d7b5479b0", size = 8217068, upload-time = "2025-10-15T15:51:36.623Z" }, - { url = "https://files.pythonhosted.org/packages/b0/d7/69479a066ea773653e88eda99031e38681e9094046f87cb957af5036db0e/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:73576a9c4a593223fbae85a64e8bbd77049abd1101893ecf3c5e981284fd58b4", size = 2371609, upload-time = "2025-10-15T15:51:29.859Z" }, - { url = "https://files.pythonhosted.org/packages/46/64/3c7fdb3771ec992b9445a1f7a969466b23ce2cdb14e09303b3db351a0655/torchvision-0.24.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:dd565b1b06666ff399d0801d4d1824fa570c0167a179ca700a5be232527b3c62", size = 8214918, upload-time = "2025-10-15T15:51:41.465Z" }, - { url = "https://files.pythonhosted.org/packages/a2/fd/615d8a86db1578345de7fa1edaf476fbcf4f057bf7e4fd898306b620c487/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:64e54494043eecf9f57a9881c6fdea49c62282782e737c002ae8b1639e6ea80e", size = 2374469, upload-time = "2025-10-15T15:51:40.19Z" }, - { url = "https://files.pythonhosted.org/packages/04/98/bac11e8fdbf00d6c398246ff2781370aa72c99f2ac685c01ce79354c9a32/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:75ef9546323b321a451239d886f0cb528f7e98bb294da47a3200effd4e572064", size = 8217060, upload-time = "2025-10-15T15:51:45.033Z" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp310-cp310-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp311-cp311-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp312-cp312-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp313-cp313t-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314-manylinux_2_28_x86_64.whl" }, + { url = "https://download.pytorch.org/whl/cu129/torchvision-0.24.0%2Bcu129-cp314-cp314t-manylinux_2_28_x86_64.whl" }, ] [[package]] @@ -6768,16 +6816,6 @@ name = "torchvision" version = "0.25.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version >= '3.14' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version == '3.13.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", - "python_full_version < '3.11' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", "python_full_version >= '3.14' and sys_platform == 'darwin'", "python_full_version == '3.13.*' and sys_platform == 'darwin'", "python_full_version == '3.12.*' and sys_platform == 'darwin'", @@ -6785,27 +6823,51 @@ resolution-markers = [ "python_full_version < '3.11' and sys_platform == 'darwin'", ] dependencies = [ - { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform != 'linux'" }, - { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform != 'linux'" }, - { name = "pillow", marker = "sys_platform != 'linux'" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform == 'darwin'" }, + { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform == 'darwin'" }, + { name = "pillow", marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.10.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/50/ae/cbf727421eb73f1cf907fbe5788326a08f111b3f6b6ddca15426b53fec9a/torchvision-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a95c47abb817d4e90ea1a8e57bd0d728e3e6b533b3495ae77d84d883c4d11f56", size = 1874919, upload-time = "2026-01-21T16:27:47.617Z" }, - { url = "https://files.pythonhosted.org/packages/8b/b9/a53bcf8f78f2cd89215e9ded70041765d50ef13bf301f9884ec6041a9421/torchvision-0.25.0-cp310-cp310-win_amd64.whl", hash = "sha256:b57430fbe9e9b697418a395041bb615124d9c007710a2712fda6e35fb310f264", size = 3697295, upload-time = "2026-01-21T16:27:36.574Z" }, { url = "https://files.pythonhosted.org/packages/3e/be/c704bceaf11c4f6b19d64337a34a877fcdfe3bd68160a8c9ae9bea4a35a3/torchvision-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:db74a551946b75d19f9996c419a799ffdf6a223ecf17c656f90da011f1d75b20", size = 1874923, upload-time = "2026-01-21T16:27:46.574Z" }, - { url = "https://files.pythonhosted.org/packages/23/19/55b28aecdc7f38df57b8eb55eb0b14a62b470ed8efeb22cdc74224df1d6a/torchvision-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:ea580ffd6094cc01914ad32f8c8118174f18974629af905cea08cb6d5d48c7b7", size = 4038722, upload-time = "2026-01-21T16:27:41.355Z" }, { url = "https://files.pythonhosted.org/packages/56/3a/6ea0d73f49a9bef38a1b3a92e8dd455cea58470985d25635beab93841748/torchvision-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c2abe430c90b1d5e552680037d68da4eb80a5852ebb1c811b2b89d299b10573b", size = 1874920, upload-time = "2026-01-21T16:27:45.348Z" }, - { url = "https://files.pythonhosted.org/packages/ad/16/8f650c2e288977cf0f8f85184b90ee56ed170a4919347fc74ee99286ed6f/torchvision-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9c55ae8d673ab493325d1267cbd285bb94d56f99626c00ac4644de32a59ede3", size = 4303059, upload-time = "2026-01-21T16:27:11.08Z" }, { url = "https://files.pythonhosted.org/packages/f5/5b/1562a04a6a5a4cf8cf40016a0cdeda91ede75d6962cff7f809a85ae966a5/torchvision-0.25.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:24e11199e4d84ba9c5ee7825ebdf1cd37ce8deec225117f10243cae984ced3ec", size = 1874918, upload-time = "2026-01-21T16:27:39.02Z" }, - { url = "https://files.pythonhosted.org/packages/32/a5/9a9b1de0720f884ea50dbf9acb22cbe5312e51d7b8c4ac6ba9b51efd9bba/torchvision-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:cef0196be31be421f6f462d1e9da1101be7332d91984caa6f8022e6c78a5877f", size = 4321911, upload-time = "2026-01-21T16:27:35.195Z" }, { url = "https://files.pythonhosted.org/packages/52/99/dca81ed21ebaeff2b67cc9f815a20fdaa418b69f5f9ea4c6ed71721470db/torchvision-0.25.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a8f8061284395ce31bcd460f2169013382ccf411148ceb2ee38e718e9860f5a7", size = 1896209, upload-time = "2026-01-21T16:27:32.159Z" }, - { url = "https://files.pythonhosted.org/packages/63/cc/0ea68b5802e5e3c31f44b307e74947bad5a38cc655231d845534ed50ddb8/torchvision-0.25.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5e6b449e9fa7d642142c0e27c41e5a43b508d57ed8e79b7c0a0c28652da8678c", size = 4344260, upload-time = "2026-01-21T16:27:17.018Z" }, { url = "https://files.pythonhosted.org/packages/9e/1f/fa839532660e2602b7e704d65010787c5bb296258b44fa8b9c1cd6175e7d/torchvision-0.25.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:620a236288d594dcec7634c754484542dc0a5c1b0e0b83a34bda5e91e9b7c3a1", size = 1896193, upload-time = "2026-01-21T16:27:24.785Z" }, - { url = "https://files.pythonhosted.org/packages/1f/eb/d0096eed5690d962853213f2ee00d91478dfcb586b62dbbb449fb8abc3a6/torchvision-0.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:d1abd5ed030c708f5dbf4812ad5f6fbe9384b63c40d6bd79f8df41a4a759a917", size = 4325058, upload-time = "2026-01-21T16:27:26.165Z" }, { url = "https://files.pythonhosted.org/packages/97/36/96374a4c7ab50dea9787ce987815614ccfe988a42e10ac1a2e3e5b60319a/torchvision-0.25.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ad9a8a5877782944d99186e4502a614770fe906626d76e9cd32446a0ac3075f2", size = 1896207, upload-time = "2026-01-21T16:27:23.383Z" }, - { url = "https://files.pythonhosted.org/packages/b6/37/e7ca4ec820d434c0f23f824eb29f0676a0c3e7a118f1514f5b949c3356da/torchvision-0.25.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f07f01d27375ad89d72aa2b3f2180f07da95dd9d2e4c758e015c0acb2da72977", size = 4425879, upload-time = "2026-01-21T16:27:12.579Z" }, +] + +[[package]] +name = "torchvision" +version = "0.25.0+cpu" +source = { registry = "https://download.pytorch.org/whl/cpu" } +resolution-markers = [ + "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version >= '3.14' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version == '3.13.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version < '3.11' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", + "python_full_version < '3.11' and platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux'", +] +dependencies = [ + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "pillow", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:3e2ae9981e32a5b9db685659d5c7af0f04b159ff20394650a90124baf6ada51a" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:c7eb5f219fdfaf1f65e68c00eb81172ab4fa08a9874dae9dad2bca360da34d0f" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:2d444009c0956669ada149f61ed78f257c1cc96d259efa6acf3929ca96ceb3f0" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:783c8fc580bbfc159bff52f4f72cdd538e42b32956e70dffa42b940db114e151" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:9212210f417888e6261c040495180f053084812cf873dedba9fc51ff4b24b2d3" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp314-cp314-win_amd64.whl", hash = "sha256:499eae1e535766391b6ee2d1e6e841239c20e2e6d88203a15b8f9f8d60a1f8bd" }, + { url = "https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp314-cp314t-win_amd64.whl", hash = "sha256:fb9f07f6a10f0ac24ac482ae68c6df99110b74a0d80a4c64fddc9753267d8815" }, ] [[package]]