From ff0b9a3c4c38cbd8814eddb217973bda335e4b68 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 20:59:00 -0600 Subject: [PATCH 001/108] working state from hameerabbasi and iddl --- .../pipelines/chroma/pipeline_chroma.py | 1001 +++++++++++++++++ 1 file changed, 1001 insertions(+) create mode 100644 src/diffusers/pipelines/chroma/pipeline_chroma.py diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py new file mode 100644 index 000000000000..50c0c4cedc57 --- /dev/null +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -0,0 +1,1001 @@ +# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + T5EncoderModel, + T5TokenizerFast, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, FluxTransformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import ( + USE_PEFT_BACKEND, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import FluxPipelineOutput + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import FluxPipeline + + >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) + >>> pipe.to("cuda") + >>> prompt = "A cat holding a sign that says hello world" + >>> # Depending on the variant being used, the pipeline call will slightly vary. + >>> # Refer to the pipeline documentation for more details. 
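+        >>> # (Optional, illustrative) pipe.enable_model_cpu_offload() can be used instead of pipe.to("cuda")
+        >>> # to trade some speed for lower peak VRAM usage.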
+ >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0] + >>> image.save("flux.png") + ``` +""" + + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class FluxPipeline( + DiffusionPipeline, + FluxLoraLoaderMixin, + FromSingleFileMixin, + TextualInversionLoaderMixin, + FluxIPAdapterMixin, +): + r""" + The Flux pipeline for text-to-image generation. 
+ + Reference: https://blackforestlabs.ai/announcing-black-forest-labs/ + + Args: + transformer ([`FluxTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([`T5EncoderModel`]): + [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically + the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`T5TokenizerFast`): + Second Tokenizer of class + [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast). + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae" + _optional_components = ["image_encoder", "feature_extractor"] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + text_encoder_2: T5EncoderModel, + tokenizer_2: T5TokenizerFast, + transformer: FluxTransformer2DModel, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + variant: str = "flux", + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + transformer=transformer, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 + # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible + # by the patch size. 
So the vae scale factor is multiplied by the patch size to account for this + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.tokenizer_max_length = ( + self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 + ) + self.default_sample_size = 128 + if variant not in {"flux", "chroma"}: + raise ValueError("`variant` must be `'flux' or `'chroma'`.") + + self.variant = variant + + def _get_chroma_attn_mask(self, length: torch.Tensor, max_sequence_length: int) -> torch.Tensor: + attention_mask = torch.zeros((length.shape[0], max_sequence_length), dtype=torch.bool, device=length.device) + for i, n_tokens in enumerate(length): + n_tokens = torch.max(n_tokens + 1, max_sequence_length) + attention_mask[i, :n_tokens] = True + return attention_mask + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2) + + text_inputs = self.tokenizer_2( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + return_length=(self.variant == "chroma"), + return_overflowing_tokens=False, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder_2( + text_input_ids.to(device), + output_hidden_states=False, + attention_mask=( + self._get_chroma_attn_mask(text_inputs.length, max_sequence_length).to(device) + if self.variant == "chroma" + else None + ), + )[0] + + dtype = self.text_encoder_2.dtype + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + _, seq_len, _ = prompt_embeds.shape + + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds + + def _get_clip_prompt_embeds( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + ): + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer_max_length, + truncation=True, + return_overflowing_tokens=False, + return_length=False, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + if 
untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer_max_length} tokens: {removed_text}" + ) + prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False) + + # Use pooled output of CLIPTextModel + prompt_embeds = prompt_embeds.pooler_output + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + prompt_2: Union[str, List[str]], + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + lora_scale: Optional[float] = None, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in all text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
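+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum number of tokens used from `prompt_2` by `tokenizer_2`; longer prompts are truncated.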
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # We only use the pooled prompt output from the CLIPTextModel + pooled_prompt_embeds = self._get_clip_prompt_embeds( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + ) + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt_2, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + if self.text_encoder is not None: + if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype + text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) + + return prompt_embeds, pooled_prompt_embeds, text_ids + + def encode_image(self, image, device, num_images_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + return image_embeds + + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt + ): + image_embeds = [] + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters: + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters." + ) + + for single_ip_adapter_image in ip_adapter_image: + single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1) + image_embeds.append(single_image_embeds[None, :]) + else: + if not isinstance(ip_adapter_image_embeds, list): + ip_adapter_image_embeds = [ip_adapter_image_embeds] + + if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters: + raise ValueError( + f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters." 
+ ) + + for single_image_embeds in ip_adapter_image_embeds: + image_embeds.append(single_image_embeds) + + ip_adapter_image_embeds = [] + for single_image_embeds in image_embeds: + single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + single_image_embeds = single_image_embeds.to(device=device) + ip_adapter_image_embeds.append(single_image_embeds) + + return ip_adapter_image_embeds + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly" + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + if max_sequence_length is not None and max_sequence_length > 512: + raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}") + + @staticmethod + def _prepare_latent_image_ids(batch_size, height, width, device, dtype): + latent_image_ids = torch.zeros(height, width, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :] + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + + return latent_image_ids.to(device=device, dtype=dtype) + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height, width) + + return latents + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. 
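+        # Illustrative example: with the default vae_scale_factor of 8, a 1024x1024 request gives a
+        # 128x128 latent grid, which the 2x2 packing below turns into 64 * 64 = 4096 patch tokens.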
+ height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, num_channels_latents, height, width) + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) + return latents.to(device=device, dtype=dtype), latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) + + return latents, latent_image_ids + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + negative_prompt: Union[str, List[str]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + true_cfg_scale: float = 1.0, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 28, + sigmas: Optional[List[float]] = None, + guidance_scale: float = 3.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + negative_ip_adapter_image: Optional[PipelineImageInput] = None, + negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + will be used instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + not greater than `1`). 
+ negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. + true_cfg_scale (`float`, *optional*, defaults to 1.0): + When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 3.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + negative_ip_adapter_image: + (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. 
It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`. + + Examples: + + Returns: + [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict` + is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated + images. + """ + + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + lora_scale = ( + self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None + ) + has_neg_prompt = negative_prompt is not None or ( + negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None + ) + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt + ( + prompt_embeds, + pooled_prompt_embeds, + text_ids, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + lora_scale=lora_scale, + ) + if do_true_cfg: + ( + negative_prompt_embeds, + negative_pooled_prompt_embeds, + negative_text_ids, + ) = self.encode_prompt( + prompt=negative_prompt, + prompt_2=negative_prompt_2, + prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=negative_pooled_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + lora_scale=lora_scale, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 5. Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + if self.transformer.config.guidance_embeds: + guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32) + guidance = guidance.expand(latents.shape[0]) + else: + guidance = None + + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and ( + negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None + ): + negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8) + negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters + + elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and ( + negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None + ): + ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8) + ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters + + if self.joint_attention_kwargs is None: + self._joint_attention_kwargs = {} + + image_embeds = None + negative_image_embeds = None + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * 
num_images_per_prompt, + ) + if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None: + negative_image_embeds = self.prepare_ip_adapter_image_embeds( + negative_ip_adapter_image, + negative_ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + ) + + # 6. Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + if image_embeds is not None: + self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + noise_pred = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + guidance=guidance, + pooled_projections=pooled_prompt_embeds, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + + if do_true_cfg: + if negative_image_embeds is not None: + self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds + neg_noise_pred = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + guidance=guidance, + pooled_projections=negative_pooled_prompt_embeds, + encoder_hidden_states=negative_prompt_embeds, + txt_ids=negative_text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, height, width, self.vae_scale_factor) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + image = self.vae.decode(latents, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return FluxPipelineOutput(images=image) From 3c2865c5345f0d1ae506050bd559bdbfeead5e94 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:02:12 -0600 Subject: [PATCH 002/108] working state form hameerabbasi and iddl (transformer) --- .../models/transformers/transformer_chroma.py | 636 ++++++++++++++++++ 1 file changed, 636 insertions(+) create mode 100644 src/diffusers/models/transformers/transformer_chroma.py diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py new file mode 100644 index 000000000000..c542bcaaccf6 --- /dev/null +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -0,0 +1,636 @@ +# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
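+
+# Note: this module closely follows the Flux transformer implementation; the `variant` argument
+# ("flux" or "chroma") switches the blocks between the standard and the pruned AdaLayerNorm paths
+# and switches the model between the Flux time/text embeddings and the Chroma approximator.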
+ + +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin +from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.import_utils import is_torch_npu_available +from ...utils.torch_utils import maybe_allow_in_graph +from ..attention import FeedForward +from ..attention_processor import ( + Attention, + AttentionProcessor, + FluxAttnProcessor2_0, + FluxAttnProcessor2_0_NPU, + FusedFluxAttnProcessor2_0, +) +from ..cache_utils import CacheMixin +from ..embeddings import ( + CombinedTimestepGuidanceTextProjEmbeddings, + CombinedTimestepTextProjChromaEmbeddings, + CombinedTimestepTextProjEmbeddings, + ChromaApproximator, + FluxPosEmbed, +) +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin +from ..normalization import ( + AdaLayerNormContinuous, + AdaLayerNormContinuousPruned, + AdaLayerNormZero, + AdaLayerNormZeroPruned, + AdaLayerNormZeroSingle, + AdaLayerNormZeroSinglePruned, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +INVALID_VARIANT_ERRMSG = "`variant` must be `'flux' or `'chroma'`." + + +@maybe_allow_in_graph +class FluxSingleTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + mlp_ratio: float = 4.0, + variant: str = "flux", + ): + super().__init__() + self.mlp_hidden_dim = int(dim * mlp_ratio) + + if variant == "flux": + self.norm = AdaLayerNormZeroSingle(dim) + elif variant == "chroma": + self.norm = AdaLayerNormZeroSinglePruned(dim) + else: + raise ValueError(INVALID_VARIANT_ERRMSG) + + self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim) + self.act_mlp = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim) + + if is_torch_npu_available(): + deprecation_message = ( + "Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors " + "should be set explicitly using the `set_attn_processor` method." 
+ ) + deprecate("npu_processor", "0.34.0", deprecation_message) + processor = FluxAttnProcessor2_0_NPU() + else: + processor = FluxAttnProcessor2_0() + + self.attn = Attention( + query_dim=dim, + cross_attention_dim=None, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=True, + processor=processor, + qk_norm="rms_norm", + eps=1e-6, + pre_only=True, + ) + + def forward( + self, + hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.Tensor: + residual = hidden_states + norm_hidden_states, gate = self.norm(hidden_states, emb=temb) + mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states)) + joint_attention_kwargs = joint_attention_kwargs or {} + attn_output = self.attn( + hidden_states=norm_hidden_states, + image_rotary_emb=image_rotary_emb, + **joint_attention_kwargs, + ) + + hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2) + gate = gate.unsqueeze(1) + hidden_states = gate * self.proj_out(hidden_states) + hidden_states = residual + hidden_states + if hidden_states.dtype == torch.float16: + hidden_states = hidden_states.clip(-65504, 65504) + + return hidden_states + + +@maybe_allow_in_graph +class FluxTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + qk_norm: str = "rms_norm", + eps: float = 1e-6, + variant: str = "flux", + ): + super().__init__() + + if variant == "flux": + self.norm1 = AdaLayerNormZero(dim) + self.norm1_context = AdaLayerNormZero(dim) + elif variant == "chroma": + self.norm1 = AdaLayerNormZeroPruned(dim) + self.norm1_context = AdaLayerNormZeroPruned(dim) + else: + raise ValueError(INVALID_VARIANT_ERRMSG) + + self.attn = Attention( + query_dim=dim, + cross_attention_dim=None, + added_kv_proj_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + context_pre_only=False, + bias=True, + processor=FluxAttnProcessor2_0(), + qk_norm=qk_norm, + eps=eps, + ) + + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) + self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + temb_img, temb_txt = temb[:, :6], temb[:, 6:] + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb_img) + + norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context( + encoder_hidden_states, emb=temb_txt + ) + joint_attention_kwargs = joint_attention_kwargs or {} + # Attention. + attention_outputs = self.attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + **joint_attention_kwargs, + ) + + if len(attention_outputs) == 2: + attn_output, context_attn_output = attention_outputs + elif len(attention_outputs) == 3: + attn_output, context_attn_output, ip_attn_output = attention_outputs + + # Process attention outputs for the `hidden_states`. 
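+        # `temb` carries the per-block modulation parameters (split above into six image-stream and six
+        # text-stream values); they provide the shift/scale/gate applied to the attention and MLP branches.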
+ attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = hidden_states + attn_output + + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + ff_output = self.ff(norm_hidden_states) + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = hidden_states + ff_output + if len(attention_outputs) == 3: + hidden_states = hidden_states + ip_attn_output + + # Process attention outputs for the `encoder_hidden_states`. + + context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output + encoder_hidden_states = encoder_hidden_states + context_attn_output + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + + context_ff_output = self.ff_context(norm_encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output + if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) + + return encoder_hidden_states, hidden_states + + +class FluxTransformer2DModel( + ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin, CacheMixin +): + """ + The Transformer model introduced in Flux. + + Reference: https://blackforestlabs.ai/announcing-black-forest-labs/ + + Args: + patch_size (`int`, defaults to `1`): + Patch size to turn the input data into small patches. + in_channels (`int`, defaults to `64`): + The number of channels in the input. + out_channels (`int`, *optional*, defaults to `None`): + The number of channels in the output. If not specified, it defaults to `in_channels`. + num_layers (`int`, defaults to `19`): + The number of layers of dual stream DiT blocks to use. + num_single_layers (`int`, defaults to `38`): + The number of layers of single stream DiT blocks to use. + attention_head_dim (`int`, defaults to `128`): + The number of dimensions to use for each attention head. + num_attention_heads (`int`, defaults to `24`): + The number of attention heads to use. + joint_attention_dim (`int`, defaults to `4096`): + The number of dimensions to use for the joint attention (embedding/channel dimension of + `encoder_hidden_states`). + pooled_projection_dim (`int`, defaults to `768`): + The number of dimensions to use for the pooled projection. + guidance_embeds (`bool`, defaults to `False`): + Whether to use guidance embeddings for guidance-distilled variant of the model. + axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`): + The dimensions to use for the rotary positional embeddings. + """ + + _supports_gradient_checkpointing = True + _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"] + _skip_layerwise_casting_patterns = ["pos_embed", "norm"] + + @register_to_config + def __init__( + self, + patch_size: int = 1, + in_channels: int = 64, + out_channels: Optional[int] = None, + num_layers: int = 19, + num_single_layers: int = 38, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + joint_attention_dim: int = 4096, + pooled_projection_dim: int = 768, + guidance_embeds: bool = False, + axes_dims_rope: Tuple[int, ...] 
= (16, 56, 56), + variant: str = "flux", + approximator_in_factor: int = 16, + approximator_hidden_dim: int = 5120, + approximator_layers: int = 5, + ): + super().__init__() + self.out_channels = out_channels or in_channels + self.inner_dim = num_attention_heads * attention_head_dim + + self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope) + + if variant == "flux": + text_time_guidance_cls = ( + CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings + ) + self.time_text_embed = text_time_guidance_cls( + embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim + ) + elif variant == "chroma": + self.time_text_embed = CombinedTimestepTextProjChromaEmbeddings( + factor=approximator_in_factor, + hidden_dim=approximator_hidden_dim, + out_dim=3 * num_single_layers + 2 * 6 * num_layers + 2, + embedding_dim=self.inner_dim, + n_layers=approximator_layers, + ) + self.distilled_guidance_layer = ChromaApproximator(in_dim=64, out_dim=3072, hidden_dim=5120, n_layers=5) + else: + raise ValueError(INVALID_VARIANT_ERRMSG) + + self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim) + self.x_embedder = nn.Linear(in_channels, self.inner_dim) + + self.transformer_blocks = nn.ModuleList( + [ + FluxTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + variant=variant, + ) + for _ in range(num_layers) + ] + ) + + self.single_transformer_blocks = nn.ModuleList( + [ + FluxSingleTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + variant=variant, + ) + for _ in range(num_single_layers) + ] + ) + + norm_out_cls = AdaLayerNormContinuous if variant != "chroma" else AdaLayerNormContinuousPruned + self.norm_out = norm_out_cls(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) + self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) + + self.gradient_checkpointing = False + + @property + def is_chroma(self) -> bool: + return isinstance(self.time_text_embed, CombinedTimestepTextProjChromaEmbeddings) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. 
+ + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0 + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + self.set_attn_processor(FusedFluxAttnProcessor2_0()) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + pooled_projections: torch.Tensor = None, + timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_block_samples=None, + controlnet_single_block_samples=None, + return_dict: bool = True, + controlnet_blocks_repeat: bool = False, + ) -> Union[torch.Tensor, Transformer2DModelOutput]: + """ + The [`FluxTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`): + Input `hidden_states`. + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected + from the embeddings of input conditions. + timestep ( `torch.LongTensor`): + Used to indicate denoising step. 
+ block_controlnet_hidden_states: (`list` of `torch.Tensor`): + A list of tensors that if specified are added to the residuals of transformer blocks. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ + if joint_attention_kwargs is not None: + joint_attention_kwargs = joint_attention_kwargs.copy() + lora_scale = joint_attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None: + logger.warning( + "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective." + ) + + is_chroma = self.is_chroma + hidden_states = self.x_embedder(hidden_states) + + timestep = timestep.to(hidden_states.dtype) * 1000 + if guidance is not None: + guidance = guidance.to(hidden_states.dtype) * 1000 + + if not is_chroma: + temb = ( + self.time_text_embed(timestep, pooled_projections) + if guidance is None + else self.time_text_embed(timestep, guidance, pooled_projections) + ) + else: + input_vec = self.time_text_embed(timestep, guidance, pooled_projections) + pooled_temb = self.distilled_guidance_layer(input_vec) + + encoder_hidden_states = self.context_embedder(encoder_hidden_states) + + if txt_ids.ndim == 3: + logger.warning( + "Passing `txt_ids` 3d torch.Tensor is deprecated." + "Please remove the batch dimension and pass it as a 2d torch Tensor" + ) + txt_ids = txt_ids[0] + if img_ids.ndim == 3: + logger.warning( + "Passing `img_ids` 3d torch.Tensor is deprecated." 
+ "Please remove the batch dimension and pass it as a 2d torch Tensor" + ) + img_ids = img_ids[0] + + ids = torch.cat((txt_ids, img_ids), dim=0) + image_rotary_emb = self.pos_embed(ids) + + if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs: + ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds") + ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds) + joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states}) + + for index_block, block in enumerate(self.transformer_blocks): + if is_chroma: + img_offset = 3 * len(self.single_transformer_blocks) + txt_offset = img_offset + 6 * len(self.transformer_blocks) + img_modulation = img_offset + 6 * index_block + text_modulation = txt_offset + 6 * index_block + temb = torch.cat( + ( + pooled_temb[:, img_modulation : img_modulation + 6], + pooled_temb[:, text_modulation : text_modulation + 6], + ), + dim=1, + ) + if torch.is_grad_enabled() and self.gradient_checkpointing: + encoder_hidden_states, hidden_states = self._gradient_checkpointing_func( + block, + hidden_states, + encoder_hidden_states, + temb, + image_rotary_emb, + ) + + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + + # controlnet residual + if controlnet_block_samples is not None: + interval_control = len(self.transformer_blocks) / len(controlnet_block_samples) + interval_control = int(np.ceil(interval_control)) + # For Xlabs ControlNet. + if controlnet_blocks_repeat: + hidden_states = ( + hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)] + ) + else: + hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control] + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + for index_block, block in enumerate(self.single_transformer_blocks): + if is_chroma: + start_idx = 3 * index_block + temb = pooled_temb[:, start_idx : start_idx + 3] + if torch.is_grad_enabled() and self.gradient_checkpointing: + hidden_states = self._gradient_checkpointing_func( + block, + hidden_states, + temb, + image_rotary_emb, + ) + + else: + hidden_states = block( + hidden_states=hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + + # controlnet residual + if controlnet_single_block_samples is not None: + interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples) + interval_control = int(np.ceil(interval_control)) + hidden_states[:, encoder_hidden_states.shape[1] :, ...] = ( + hidden_states[:, encoder_hidden_states.shape[1] :, ...] + + controlnet_single_block_samples[index_block // interval_control] + ) + + hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...] 
+ + if is_chroma: + temb = pooled_temb[:, -2:] + hidden_states = self.norm_out(hidden_states, temb) + output = self.proj_out(hidden_states) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) From e271af9495435016e2af1230e66ea242e624c720 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:03:10 -0600 Subject: [PATCH 003/108] working state (normalization) --- src/diffusers/models/normalization.py | 119 +++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py index 4a512c5cb166..f2b71bb6888e 100644 --- a/src/diffusers/models/normalization.py +++ b/src/diffusers/models/normalization.py @@ -171,6 +171,46 @@ def forward( return x, gate_msa, shift_mlp, scale_mlp, gate_mlp +class AdaLayerNormZeroPruned(nn.Module): + r""" + Norm layer adaptive layer norm zero (adaLN-Zero). + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. + """ + + def __init__(self, embedding_dim: int, num_embeddings: Optional[int] = None, norm_type="layer_norm", bias=True): + super().__init__() + if num_embeddings is not None: + self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim) + else: + self.emb = None + + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) + elif norm_type == "fp32_layer_norm": + self.norm = FP32LayerNorm(embedding_dim, elementwise_affine=False, bias=False) + else: + raise ValueError( + f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'." + ) + + def forward( + self, + x: torch.Tensor, + timestep: Optional[torch.Tensor] = None, + class_labels: Optional[torch.LongTensor] = None, + hidden_dtype: Optional[torch.dtype] = None, + emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + if self.emb is not None: + emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype) + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.squeeze(0).chunk(6, dim=0) + x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] + return x, gate_msa, shift_mlp, scale_mlp, gate_mlp + + class AdaLayerNormZeroSingle(nn.Module): r""" Norm layer adaptive layer norm zero (adaLN-Zero). @@ -203,6 +243,35 @@ def forward( return x, gate_msa +class AdaLayerNormZeroSinglePruned(nn.Module): + r""" + Norm layer adaptive layer norm zero (adaLN-Zero). + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. + """ + + def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True): + super().__init__() + + if norm_type == "layer_norm": + self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) + else: + raise ValueError( + f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'." 
+ ) + + def forward( + self, + x: torch.Tensor, + emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + shift_msa, scale_msa, gate_msa = emb.squeeze(0).chunk(3, dim=0) + x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] + return x, gate_msa + + class LuminaRMSNormZero(nn.Module): """ Norm layer adaptive RMS normalization zero. @@ -237,7 +306,7 @@ class AdaLayerNormSingle(nn.Module): r""" Norm layer adaptive layer norm single (adaLN-single). - As proposed in PixArt-Alpha (see: https://huggingface.co/papers/2310.00426; Section 2.3). + As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3). Parameters: embedding_dim (`int`): The size of each embedding vector. @@ -305,6 +374,50 @@ def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor: return x +class AdaLayerNormContinuousPruned(nn.Module): + r""" + Adaptive normalization layer with a norm layer (layer_norm or rms_norm). + + Args: + embedding_dim (`int`): Embedding dimension to use during projection. + conditioning_embedding_dim (`int`): Dimension of the input condition. + elementwise_affine (`bool`, defaults to `True`): + Boolean flag to denote if affine transformation should be applied. + eps (`float`, defaults to 1e-5): Epsilon factor. + bias (`bias`, defaults to `True`): Boolean flag to denote if bias should be use. + norm_type (`str`, defaults to `"layer_norm"`): + Normalization layer to use. Values supported: "layer_norm", "rms_norm". + """ + + def __init__( + self, + embedding_dim: int, + conditioning_embedding_dim: int, + # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters + # because the output is immediately scaled and shifted by the projected conditioning embeddings. + # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters. + # However, this is how it was implemented in the original code, and it's rather likely you should + # set `elementwise_affine` to False. + elementwise_affine=True, + eps=1e-5, + bias=True, + norm_type="layer_norm", + ): + super().__init__() + if norm_type == "layer_norm": + self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(embedding_dim, eps, elementwise_affine) + else: + raise ValueError(f"unknown norm_type {norm_type}") + + def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor: + # convert back to the original dtype in case `conditioning_embedding`` is upcasted to float32 (needed for hunyuanDiT) + shift, scale = torch.chunk(emb.squeeze(0).to(x.dtype), 2, dim=0) + x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + return x + + class AdaLayerNormContinuous(nn.Module): r""" Adaptive normalization layer with a norm layer (layer_norm or rms_norm). @@ -510,7 +623,7 @@ def forward(self, input): class RMSNorm(nn.Module): r""" - RMS Norm as introduced in https://huggingface.co/papers/1910.07467 by Zhang et al. + RMS Norm as introduced in https://arxiv.org/abs/1910.07467 by Zhang et al. Args: dim (`int`): Number of dimensions to use for `weights`. Only effective when `elementwise_affine` is True. @@ -600,7 +713,7 @@ def forward(self, hidden_states): class GlobalResponseNorm(nn.Module): r""" - Global response normalization as introduced in ConvNeXt-v2 (https://huggingface.co/papers/2301.00808). + Global response normalization as introduced in ConvNeXt-v2 (https://arxiv.org/abs/2301.00808). 
Args: dim (`int`): Number of dimensions to use for the `gamma` and `beta`. From 15f2bd5c3971f94475eacc01c3ac5ac802e32461 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:05:59 -0600 Subject: [PATCH 004/108] working state (embeddings) --- src/diffusers/models/embeddings.py | 54 ++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index c25e9997e3fb..8aa2ea5841e9 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -31,7 +31,7 @@ def get_timestep_embedding( downscale_freq_shift: float = 1, scale: float = 1, max_period: int = 10000, -): +) -> torch.Tensor: """ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. @@ -1327,7 +1327,7 @@ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shif self.downscale_freq_shift = downscale_freq_shift self.scale = scale - def forward(self, timesteps): + def forward(self, timesteps: torch.Tensor) -> torch.Tensor: t_emb = get_timestep_embedding( timesteps, self.num_channels, @@ -1401,7 +1401,7 @@ class ImagePositionalEmbeddings(nn.Module): Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the height and width of the latent space. - For more details, see figure 10 of the dall-e paper: https://huggingface.co/papers/2102.12092 + For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092 For VQ-diffusion: @@ -1637,6 +1637,35 @@ def forward(self, timestep, guidance, pooled_projection): return conditioning +class CombinedTimestepTextProjChromaEmbeddings(nn.Module): + def __init__(self, factor: int, hidden_dim: int, out_dim: int, n_layers: int, embedding_dim: int): + super().__init__() + + self.time_proj = Timesteps(num_channels=factor, flip_sin_to_cos=True, downscale_freq_shift=0) + self.guidance_proj = Timesteps(num_channels=factor, flip_sin_to_cos=True, downscale_freq_shift=0) + + self.register_buffer( + "mod_proj", + get_timestep_embedding(torch.arange(out_dim)*1000, 2 * factor, flip_sin_to_cos=True, downscale_freq_shift=0, ), + persistent=False, + ) + + def forward( + self, timestep: torch.Tensor, guidance: Optional[torch.Tensor], pooled_projections: torch.Tensor + ) -> torch.Tensor: + mod_index_length = self.mod_proj.shape[0] + timesteps_proj = self.time_proj(timestep).to(dtype=timestep.dtype) + guidance_proj = self.guidance_proj(torch.tensor([0])).to(dtype=timestep.dtype, device=timestep.device) + + mod_proj = self.mod_proj.to(dtype=timesteps_proj.dtype, device=timesteps_proj.device) + timestep_guidance = ( + torch.cat([timesteps_proj, guidance_proj], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1) + ) + input_vec = torch.cat([timestep_guidance, mod_proj.unsqueeze(0)], dim=-1) + + return input_vec + + class CogView3CombinedTimestepSizeEmbeddings(nn.Module): def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): super().__init__() @@ -2230,6 +2259,25 @@ def forward(self, caption): return hidden_states +class ChromaApproximator(nn.Module): + def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers: int = 5): + super().__init__() + self.in_proj = nn.Linear(in_dim, hidden_dim, bias=True) + self.layers = nn.ModuleList( + [PixArtAlphaTextProjection(hidden_dim, hidden_dim, act_fn="silu") for _ in 
range(n_layers)] + ) + self.norms = nn.ModuleList([nn.RMSNorm(hidden_dim) for _ in range(n_layers)]) + self.out_proj = nn.Linear(hidden_dim, out_dim) + + def forward(self, x): + x = self.in_proj(x) + + for layer, norms in zip(self.layers, self.norms): + x = x + layer(norms(x)) + + return self.out_proj(x) + + class IPAdapterPlusImageProjectionBlock(nn.Module): def __init__( self, From 32e6a006cfe486ba774acf2920ffcf5382ed2449 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:13:32 -0600 Subject: [PATCH 005/108] add chroma loader --- src/diffusers/loaders/single_file_utils.py | 166 +++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 0f762b949d47..aace8fc7bffb 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -3310,3 +3310,169 @@ def convert_hidream_transformer_to_diffusers(checkpoint, **kwargs): checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k) return checkpoint + +def convert_chroma_transformer_checkpoint_to_diffusers(checkpoint, **kwargs): + converted_state_dict = {} + keys = list(checkpoint.keys()) + + for k in keys: + if "model.diffusion_model." in k: + checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k) + + num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "double_blocks." in k))[-1] + 1 # noqa: C401 + num_single_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "single_blocks." in k))[-1] + 1 # noqa: C401 + num_guidance_layers = list(set(int(k.split(".", 3)[2]) for k in checkpoint if "distilled_guidance_layer.layers." in k))[-1] + 1 # noqa: C401 + mlp_ratio = 4.0 + inner_dim = 3072 + + # in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale; + # while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation + def swap_scale_shift(weight): + shift, scale = weight.chunk(2, dim=0) + new_weight = torch.cat([scale, shift], dim=0) + return new_weight + + # guidance + converted_state_dict["time_text_embed.embedder.in_proj.bias"] = checkpoint.pop( + "distilled_guidance_layer.in_proj.bias" + ) + converted_state_dict["time_text_embed.embedder.in_proj.weight"] = checkpoint.pop( + "distilled_guidance_layer.in_proj.weight" + ) + converted_state_dict["time_text_embed.embedder.out_proj.bias"] = checkpoint.pop( + "distilled_guidance_layer.out_proj.bias" + ) + converted_state_dict["time_text_embed.embedder.out_proj.weight"] = checkpoint.pop( + "distilled_guidance_layer.out_proj.weight" + ) + for i in range(num_guidance_layers): + block_prefix = f"time_text_embed.embedder.layers.{i}." 
+ converted_state_dict[f"{block_prefix}linear_1.bias"] = checkpoint.pop( + f"distilled_guidance_layer.layers.{i}.in_layer.bias" + ) + converted_state_dict[f"{block_prefix}linear_1.weight"] = checkpoint.pop( + f"distilled_guidance_layer.layers.{i}.in_layer.weight" + ) + converted_state_dict[f"{block_prefix}linear_2.bias"] = checkpoint.pop( + f"distilled_guidance_layer.layers.{i}.out_layer.bias" + ) + converted_state_dict[f"{block_prefix}linear_2.weight"] = checkpoint.pop( + f"distilled_guidance_layer.layers.{i}.out_layer.weight" + ) + converted_state_dict[f"time_text_embed.embedder.norms.{i}.weight"] = checkpoint.pop( + f"distilled_guidance_layer.norms.{i}.scale" + ) + + # context_embedder + converted_state_dict["context_embedder.weight"] = checkpoint.pop("txt_in.weight") + converted_state_dict["context_embedder.bias"] = checkpoint.pop("txt_in.bias") + + # x_embedder + converted_state_dict["x_embedder.weight"] = checkpoint.pop("img_in.weight") + converted_state_dict["x_embedder.bias"] = checkpoint.pop("img_in.bias") + + # double transformer blocks + for i in range(num_layers): + block_prefix = f"transformer_blocks.{i}." + # Q, K, V + sample_q, sample_k, sample_v = torch.chunk(checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.weight"), 3, dim=0) + context_q, context_k, context_v = torch.chunk( + checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.weight"), 3, dim=0 + ) + sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk( + checkpoint.pop(f"double_blocks.{i}.img_attn.qkv.bias"), 3, dim=0 + ) + context_q_bias, context_k_bias, context_v_bias = torch.chunk( + checkpoint.pop(f"double_blocks.{i}.txt_attn.qkv.bias"), 3, dim=0 + ) + converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([sample_q]) + converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([sample_q_bias]) + converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([sample_k]) + converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([sample_k_bias]) + converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([sample_v]) + converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([sample_v_bias]) + converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = torch.cat([context_q]) + converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = torch.cat([context_q_bias]) + converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = torch.cat([context_k]) + converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = torch.cat([context_k_bias]) + converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = torch.cat([context_v]) + converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = torch.cat([context_v_bias]) + # qk_norm + converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop( + f"double_blocks.{i}.img_attn.norm.query_norm.scale" + ) + converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop( + f"double_blocks.{i}.img_attn.norm.key_norm.scale" + ) + converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = checkpoint.pop( + f"double_blocks.{i}.txt_attn.norm.query_norm.scale" + ) + converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = checkpoint.pop( + f"double_blocks.{i}.txt_attn.norm.key_norm.scale" + ) + # ff img_mlp + converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = checkpoint.pop( + f"double_blocks.{i}.img_mlp.0.weight" + ) + converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.0.bias") + 
converted_state_dict[f"{block_prefix}ff.net.2.weight"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.weight") + converted_state_dict[f"{block_prefix}ff.net.2.bias"] = checkpoint.pop(f"double_blocks.{i}.img_mlp.2.bias") + converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = checkpoint.pop( + f"double_blocks.{i}.txt_mlp.0.weight" + ) + converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = checkpoint.pop( + f"double_blocks.{i}.txt_mlp.0.bias" + ) + converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = checkpoint.pop( + f"double_blocks.{i}.txt_mlp.2.weight" + ) + converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = checkpoint.pop( + f"double_blocks.{i}.txt_mlp.2.bias" + ) + # output projections. + converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = checkpoint.pop( + f"double_blocks.{i}.img_attn.proj.weight" + ) + converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = checkpoint.pop( + f"double_blocks.{i}.img_attn.proj.bias" + ) + converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = checkpoint.pop( + f"double_blocks.{i}.txt_attn.proj.weight" + ) + converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = checkpoint.pop( + f"double_blocks.{i}.txt_attn.proj.bias" + ) + + # single transformer blocks + for i in range(num_single_layers): + block_prefix = f"single_transformer_blocks.{i}." + # Q, K, V, mlp + mlp_hidden_dim = int(inner_dim * mlp_ratio) + split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim) + q, k, v, mlp = torch.split(checkpoint.pop(f"single_blocks.{i}.linear1.weight"), split_size, dim=0) + q_bias, k_bias, v_bias, mlp_bias = torch.split( + checkpoint.pop(f"single_blocks.{i}.linear1.bias"), split_size, dim=0 + ) + converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([q]) + converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([q_bias]) + converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([k]) + converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([k_bias]) + converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([v]) + converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([v_bias]) + converted_state_dict[f"{block_prefix}proj_mlp.weight"] = torch.cat([mlp]) + converted_state_dict[f"{block_prefix}proj_mlp.bias"] = torch.cat([mlp_bias]) + # qk norm + converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = checkpoint.pop( + f"single_blocks.{i}.norm.query_norm.scale" + ) + converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = checkpoint.pop( + f"single_blocks.{i}.norm.key_norm.scale" + ) + # output projections. 
+ converted_state_dict[f"{block_prefix}proj_out.weight"] = checkpoint.pop(f"single_blocks.{i}.linear2.weight") + converted_state_dict[f"{block_prefix}proj_out.bias"] = checkpoint.pop(f"single_blocks.{i}.linear2.bias") + + converted_state_dict["proj_out.weight"] = checkpoint.pop("final_layer.linear.weight") + converted_state_dict["proj_out.bias"] = checkpoint.pop("final_layer.linear.bias") + + return converted_state_dict From bc36a0d883bc594ec49ed4c01537aa827a8202c1 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:15:19 -0600 Subject: [PATCH 006/108] add chroma to mappings --- src/diffusers/loaders/single_file_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 6919c4949d59..82e4db7283cc 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -30,6 +30,7 @@ convert_auraflow_transformer_checkpoint_to_diffusers, convert_autoencoder_dc_checkpoint_to_diffusers, convert_controlnet_checkpoint, + convert_chroma_transformer_checkpoint_to_diffusers, convert_flux_transformer_checkpoint_to_diffusers, convert_hidream_transformer_to_diffusers, convert_hunyuan_video_transformer_to_diffusers, @@ -97,6 +98,10 @@ "checkpoint_mapping_fn": convert_flux_transformer_checkpoint_to_diffusers, "default_subfolder": "transformer", }, + "ChromaTransformer2DModel": { + "checkpoint_mapping_fn": convert_chroma_transformer_checkpoint_to_diffusers, + "default_subfolder": "transformer", + } "LTXVideoTransformer3DModel": { "checkpoint_mapping_fn": convert_ltx_transformer_checkpoint_to_diffusers, "default_subfolder": "transformer", From 33ea0b65a42f65965fe74ba1ab778b86d0d05919 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:25:19 -0600 Subject: [PATCH 007/108] add chroma to transformer init --- src/diffusers/models/transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index e7b8ba55ca61..cc03a0ccbcdf 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -17,6 +17,7 @@ from .t5_film_transformer import T5FilmDecoder from .transformer_2d import Transformer2DModel from .transformer_allegro import AllegroTransformer3DModel + from .transformer_chroma import ChromaTransformer2DModel from .transformer_cogview3plus import CogView3PlusTransformer2DModel from .transformer_cogview4 import CogView4Transformer2DModel from .transformer_cosmos import CosmosTransformer3DModel From 22ecd19f91039705f90a81c5cc1afa2d8413a26b Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Mon, 9 Jun 2025 21:32:52 -0600 Subject: [PATCH 008/108] take out variant stuff --- .../models/transformers/transformer_chroma.py | 119 ++++++------------ 1 file changed, 36 insertions(+), 83 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py index c542bcaaccf6..1f726f5cb4b0 100644 --- a/src/diffusers/models/transformers/transformer_chroma.py +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -43,40 +43,27 @@ from ..modeling_outputs import Transformer2DModelOutput from ..modeling_utils import ModelMixin from ..normalization import ( - AdaLayerNormContinuous, AdaLayerNormContinuousPruned, 
- AdaLayerNormZero, AdaLayerNormZeroPruned, - AdaLayerNormZeroSingle, AdaLayerNormZeroSinglePruned, ) logger = logging.get_logger(__name__) # pylint: disable=invalid-name -INVALID_VARIANT_ERRMSG = "`variant` must be `'flux' or `'chroma'`." - @maybe_allow_in_graph -class FluxSingleTransformerBlock(nn.Module): +class ChromaSingleTransformerBlock(nn.Module): def __init__( self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0, - variant: str = "flux", ): super().__init__() self.mlp_hidden_dim = int(dim * mlp_ratio) - - if variant == "flux": - self.norm = AdaLayerNormZeroSingle(dim) - elif variant == "chroma": - self.norm = AdaLayerNormZeroSinglePruned(dim) - else: - raise ValueError(INVALID_VARIANT_ERRMSG) - + self.norm = AdaLayerNormZeroSinglePruned(dim) self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim) self.act_mlp = nn.GELU(approximate="tanh") self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim) @@ -132,7 +119,7 @@ def forward( @maybe_allow_in_graph -class FluxTransformerBlock(nn.Module): +class ChromaTransformerBlock(nn.Module): def __init__( self, dim: int, @@ -140,18 +127,10 @@ def __init__( attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6, - variant: str = "flux", ): super().__init__() - - if variant == "flux": - self.norm1 = AdaLayerNormZero(dim) - self.norm1_context = AdaLayerNormZero(dim) - elif variant == "chroma": - self.norm1 = AdaLayerNormZeroPruned(dim) - self.norm1_context = AdaLayerNormZeroPruned(dim) - else: - raise ValueError(INVALID_VARIANT_ERRMSG) + self.norm1 = AdaLayerNormZeroPruned(dim) + self.norm1_context = AdaLayerNormZeroPruned(dim) self.attn = Attention( query_dim=dim, @@ -231,13 +210,13 @@ def forward( return encoder_hidden_states, hidden_states -class FluxTransformer2DModel( +class ChromaTransformer2DModel( ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin, CacheMixin ): """ - The Transformer model introduced in Flux. + The Transformer model introduced in Flux, modified for Chroma. - Reference: https://blackforestlabs.ai/announcing-black-forest-labs/ + Reference: https://huggingface.co/lodestones/Chroma Args: patch_size (`int`, defaults to `1`): @@ -266,7 +245,7 @@ class FluxTransformer2DModel( """ _supports_gradient_checkpointing = True - _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"] + _no_split_modules = ["ChromaTransformerBlock", "ChromaSingleTransformerBlock"] _skip_layerwise_casting_patterns = ["pos_embed", "norm"] @register_to_config @@ -283,7 +262,6 @@ def __init__( pooled_projection_dim: int = 768, guidance_embeds: bool = False, axes_dims_rope: Tuple[int, ...] 
= (16, 56, 56), - variant: str = "flux", approximator_in_factor: int = 16, approximator_hidden_dim: int = 5120, approximator_layers: int = 5, @@ -294,31 +272,21 @@ def __init__( self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope) - if variant == "flux": - text_time_guidance_cls = ( - CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings - ) - self.time_text_embed = text_time_guidance_cls( - embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim - ) - elif variant == "chroma": - self.time_text_embed = CombinedTimestepTextProjChromaEmbeddings( - factor=approximator_in_factor, - hidden_dim=approximator_hidden_dim, - out_dim=3 * num_single_layers + 2 * 6 * num_layers + 2, - embedding_dim=self.inner_dim, - n_layers=approximator_layers, - ) - self.distilled_guidance_layer = ChromaApproximator(in_dim=64, out_dim=3072, hidden_dim=5120, n_layers=5) - else: - raise ValueError(INVALID_VARIANT_ERRMSG) + self.time_text_embed = CombinedTimestepTextProjChromaEmbeddings( + factor=approximator_in_factor, + hidden_dim=approximator_hidden_dim, + out_dim=3 * num_single_layers + 2 * 6 * num_layers + 2, + embedding_dim=self.inner_dim, + n_layers=approximator_layers, + ) + self.distilled_guidance_layer = ChromaApproximator(in_dim=64, out_dim=3072, hidden_dim=5120, n_layers=5) self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim) self.x_embedder = nn.Linear(in_channels, self.inner_dim) self.transformer_blocks = nn.ModuleList( [ - FluxTransformerBlock( + ChromaTransformerBlock( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, @@ -330,7 +298,7 @@ def __init__( self.single_transformer_blocks = nn.ModuleList( [ - FluxSingleTransformerBlock( + ChromaSingleTransformerBlock( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, @@ -340,16 +308,12 @@ def __init__( ] ) - norm_out_cls = AdaLayerNormContinuous if variant != "chroma" else AdaLayerNormContinuousPruned + norm_out_cls = AdaLayerNormContinuousPruned self.norm_out = norm_out_cls(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) self.gradient_checkpointing = False - @property - def is_chroma(self) -> bool: - return isinstance(self.time_text_embed, CombinedTimestepTextProjChromaEmbeddings) - @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -506,22 +470,14 @@ def forward( "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective." 
) - is_chroma = self.is_chroma hidden_states = self.x_embedder(hidden_states) timestep = timestep.to(hidden_states.dtype) * 1000 if guidance is not None: guidance = guidance.to(hidden_states.dtype) * 1000 - if not is_chroma: - temb = ( - self.time_text_embed(timestep, pooled_projections) - if guidance is None - else self.time_text_embed(timestep, guidance, pooled_projections) - ) - else: - input_vec = self.time_text_embed(timestep, guidance, pooled_projections) - pooled_temb = self.distilled_guidance_layer(input_vec) + input_vec = self.time_text_embed(timestep, guidance, pooled_projections) + pooled_temb = self.distilled_guidance_layer(input_vec) encoder_hidden_states = self.context_embedder(encoder_hidden_states) @@ -547,18 +503,17 @@ def forward( joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states}) for index_block, block in enumerate(self.transformer_blocks): - if is_chroma: - img_offset = 3 * len(self.single_transformer_blocks) - txt_offset = img_offset + 6 * len(self.transformer_blocks) - img_modulation = img_offset + 6 * index_block - text_modulation = txt_offset + 6 * index_block - temb = torch.cat( - ( - pooled_temb[:, img_modulation : img_modulation + 6], - pooled_temb[:, text_modulation : text_modulation + 6], - ), - dim=1, - ) + img_offset = 3 * len(self.single_transformer_blocks) + txt_offset = img_offset + 6 * len(self.transformer_blocks) + img_modulation = img_offset + 6 * index_block + text_modulation = txt_offset + 6 * index_block + temb = torch.cat( + ( + pooled_temb[:, img_modulation : img_modulation + 6], + pooled_temb[:, text_modulation : text_modulation + 6], + ), + dim=1, + ) if torch.is_grad_enabled() and self.gradient_checkpointing: encoder_hidden_states, hidden_states = self._gradient_checkpointing_func( block, @@ -591,9 +546,8 @@ def forward( hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) for index_block, block in enumerate(self.single_transformer_blocks): - if is_chroma: - start_idx = 3 * index_block - temb = pooled_temb[:, start_idx : start_idx + 3] + start_idx = 3 * index_block + temb = pooled_temb[:, start_idx : start_idx + 3] if torch.is_grad_enabled() and self.gradient_checkpointing: hidden_states = self._gradient_checkpointing_func( block, @@ -621,8 +575,7 @@ def forward( hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...] 
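The `pooled_temb` table sliced in the loops above comes from `distilled_guidance_layer`, which maps a small per-index conditioning vector to `inner_dim`. Per modulation index, that input is assumed to be [timestep embedding | guidance embedding | index embedding], i.e. `factor + factor + 2 * factor = 64` dims; a rough shape sketch with torch:

import torch

batch, factor, inner_dim = 1, 16, 3072
out_dim = 344  # 3 * 38 single-block + 12 * 19 double-block + 2 final-norm vectors

timestep_emb = torch.randn(batch, factor)     # sinusoidal projection of the timestep
guidance_emb = torch.randn(batch, factor)     # sinusoidal projection of the (zeroed) guidance
index_emb = torch.randn(out_dim, 2 * factor)  # one sinusoidal embedding per modulation index

cond = torch.cat([timestep_emb, guidance_emb], dim=1)        # (batch, 2 * factor)
cond = cond.unsqueeze(1).expand(batch, out_dim, 2 * factor)  # broadcast over indices
approx_in = torch.cat([cond, index_emb.expand(batch, out_dim, 2 * factor)], dim=-1)
assert approx_in.shape == (batch, out_dim, 4 * factor)       # 64-dim input per index
# the approximator MLP maps 4 * factor -> inner_dim for every index, yielding the
# (batch, out_dim, inner_dim) modulation table consumed block by block above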
- if is_chroma: - temb = pooled_temb[:, -2:] + temb = pooled_temb[:, -2:] hidden_states = self.norm_out(hidden_states, temb) output = self.proj_out(hidden_states) From b0df9691d2ec5caa42a9310eef250bef513f15f7 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Tue, 10 Jun 2025 02:09:52 -0600 Subject: [PATCH 009/108] get decently far in changing variant stuff --- .../pipelines/chroma/pipeline_chroma.py | 182 ++---------------- 1 file changed, 21 insertions(+), 161 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 50c0c4cedc57..f6d2e366e48e 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -40,7 +40,7 @@ ) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline -from .pipeline_output import FluxPipelineOutput +from .pipeline_output import ChromaPipelineOutput if is_torch_xla_available(): @@ -57,15 +57,13 @@ Examples: ```py >>> import torch - >>> from diffusers import FluxPipeline + >>> from diffusers import ChromaPipeline - >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) + >>> pipe = ChromaPipeline.from_single_file("chroma-unlocked-v35-detail-calibrated.safetensors", torch_dtype=torch.bfloat16) >>> pipe.to("cuda") >>> prompt = "A cat holding a sign that says hello world" - >>> # Depending on the variant being used, the pipeline call will slightly vary. - >>> # Refer to the pipeline documentation for more details. - >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0] - >>> image.save("flux.png") + >>> image = pipe(prompt, num_inference_steps=28, guidance_scale=4.0).images[0] + >>> image.save("chroma.png") ``` """ @@ -143,7 +141,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class FluxPipeline( +class ChromaPipeline( DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin, @@ -151,27 +149,21 @@ class FluxPipeline( FluxIPAdapterMixin, ): r""" - The Flux pipeline for text-to-image generation. + The Chroma pipeline for text-to-image generation. - Reference: https://blackforestlabs.ai/announcing-black-forest-labs/ + Reference: https://huggingface.co/lodestones/Chroma/ Args: - transformer ([`FluxTransformer2DModel`]): + transformer ([`ChromaTransformer2DModel`]): Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. scheduler ([`FlowMatchEulerDiscreteScheduler`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - text_encoder_2 ([`T5EncoderModel`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representation + text_encoder ([`T5EncoderModel`]): [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). 
- tokenizer_2 (`T5TokenizerFast`): + tokenizer (`T5TokenizerFast`): Second Tokenizer of class [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast). """ @@ -184,11 +176,9 @@ def __init__( self, scheduler: FlowMatchEulerDiscreteScheduler, vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - text_encoder_2: T5EncoderModel, - tokenizer_2: T5TokenizerFast, - transformer: FluxTransformer2DModel, + text_encoder: T5EncoderModel, + tokenizer: T5TokenizerFast, + transformer: ChromaTransformer2DModel, image_encoder: CLIPVisionModelWithProjection = None, feature_extractor: CLIPImageProcessor = None, variant: str = "flux", @@ -198,9 +188,7 @@ def __init__( self.register_modules( vae=vae, text_encoder=text_encoder, - text_encoder_2=text_encoder_2, tokenizer=tokenizer, - tokenizer_2=tokenizer_2, transformer=transformer, scheduler=scheduler, image_encoder=image_encoder, @@ -214,10 +202,6 @@ def __init__( self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 ) self.default_sample_size = 128 - if variant not in {"flux", "chroma"}: - raise ValueError("`variant` must be `'flux' or `'chroma'`.") - - self.variant = variant def _get_chroma_attn_mask(self, length: torch.Tensor, max_sequence_length: int) -> torch.Tensor: attention_mask = torch.zeros((length.shape[0], max_sequence_length), dtype=torch.bool, device=length.device) @@ -248,7 +232,7 @@ def _get_t5_prompt_embeds( padding="max_length", max_length=max_sequence_length, truncation=True, - return_length=(self.variant == "chroma"), + return_length=True, return_overflowing_tokens=False, return_tensors="pt", ) @@ -267,8 +251,6 @@ def _get_t5_prompt_embeds( output_hidden_states=False, attention_mask=( self._get_chroma_attn_mask(text_inputs.length, max_sequence_length).to(device) - if self.variant == "chroma" - else None ), )[0] @@ -283,58 +265,12 @@ def _get_t5_prompt_embeds( return prompt_embeds - def _get_clip_prompt_embeds( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - device: Optional[torch.device] = None, - ): - device = device or self._execution_device - - prompt = [prompt] if isinstance(prompt, str) else prompt - batch_size = len(prompt) - - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer_max_length, - truncation=True, - return_overflowing_tokens=False, - return_length=False, - return_tensors="pt", - ) - - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer_max_length} tokens: {removed_text}" - ) - prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False) - - # Use pooled output of CLIPTextModel - prompt_embeds = prompt_embeds.pooler_output - prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) - - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt) - prompt_embeds = 
prompt_embeds.view(batch_size * num_images_per_prompt, -1) - - return prompt_embeds - def encode_prompt( self, prompt: Union[str, List[str]], - prompt_2: Union[str, List[str]], device: Optional[torch.device] = None, num_images_per_prompt: int = 1, prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, max_sequence_length: int = 512, lora_scale: Optional[float] = None, ): @@ -343,9 +279,6 @@ def encode_prompt( Args: prompt (`str` or `List[str]`, *optional*): prompt to be encoded - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - used in all text-encoders device: (`torch.device`): torch device num_images_per_prompt (`int`): @@ -369,21 +302,11 @@ def encode_prompt( # dynamically adjust the LoRA scale if self.text_encoder is not None and USE_PEFT_BACKEND: scale_lora_layers(self.text_encoder, lora_scale) - if self.text_encoder_2 is not None and USE_PEFT_BACKEND: - scale_lora_layers(self.text_encoder_2, lora_scale) prompt = [prompt] if isinstance(prompt, str) else prompt if prompt_embeds is None: - prompt_2 = prompt_2 or prompt - prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 - # We only use the pooled prompt output from the CLIPTextModel - pooled_prompt_embeds = self._get_clip_prompt_embeds( - prompt=prompt, - device=device, - num_images_per_prompt=num_images_per_prompt, - ) prompt_embeds = self._get_t5_prompt_embeds( prompt=prompt_2, num_images_per_prompt=num_images_per_prompt, @@ -396,15 +319,10 @@ def encode_prompt( # Retrieve the original scale by scaling back the LoRA layers unscale_lora_layers(self.text_encoder, lora_scale) - if self.text_encoder_2 is not None: - if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder_2, lora_scale) - dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) - return prompt_embeds, pooled_prompt_embeds, text_ids + return prompt_embeds, text_ids def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype @@ -456,15 +374,12 @@ def prepare_ip_adapter_image_embeds( def check_inputs( self, prompt, - prompt_2, height, width, negative_prompt=None, negative_prompt_2=None, prompt_embeds=None, negative_prompt_embeds=None, - pooled_prompt_embeds=None, - negative_pooled_prompt_embeds=None, callback_on_step_end_tensor_inputs=None, max_sequence_length=None, ): @@ -485,39 +400,18 @@ def check_inputs( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" " only forward one of the two." ) - elif prompt_2 is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): - raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) - elif negative_prompt_2 is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and pooled_prompt_embeds is None: - raise ValueError( - "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." - ) - if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) if max_sequence_length is not None and max_sequence_length > 512: raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}") @@ -649,10 +543,7 @@ def interrupt(self): def __call__( self, prompt: Union[str, List[str]] = None, - prompt_2: Optional[Union[str, List[str]]] = None, negative_prompt: Union[str, List[str]] = None, - negative_prompt_2: Optional[Union[str, List[str]]] = None, - true_cfg_scale: float = 1.0, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, @@ -662,13 +553,11 @@ def __call__( generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, negative_ip_adapter_image: Optional[PipelineImageInput] = None, negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, joint_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -683,18 +572,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - will be used instead. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is not greater than `1`). 
- negative_prompt_2 (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and - `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - true_cfg_scale (`float`, *optional*, defaults to 1.0): - When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -724,9 +605,6 @@ def __call__( prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of @@ -742,10 +620,6 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -769,7 +643,7 @@ def __call__( Examples: Returns: - [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict` + [`~pipelines.chroma.ChromaPipelineOutput`] or `tuple`: [`~pipelines.chroma.ChromaPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. """ @@ -780,15 +654,11 @@ def __call__( # 1. Check inputs. 
Raise error if not correct self.check_inputs( prompt, - prompt_2, height, width, negative_prompt=negative_prompt, - negative_prompt_2=negative_prompt_2, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, max_sequence_length=max_sequence_length, ) @@ -811,34 +681,25 @@ def __call__( lora_scale = ( self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None ) - has_neg_prompt = negative_prompt is not None or ( - negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None - ) - do_true_cfg = true_cfg_scale > 1 and has_neg_prompt + do_cfg = guidance_scale > 1 ( prompt_embeds, - pooled_prompt_embeds, text_ids, ) = self.encode_prompt( prompt=prompt, - prompt_2=prompt_2, prompt_embeds=prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, device=device, num_images_per_prompt=num_images_per_prompt, max_sequence_length=max_sequence_length, lora_scale=lora_scale, ) - if do_true_cfg: + if do_cfg: ( negative_prompt_embeds, - negative_pooled_prompt_embeds, negative_text_ids, ) = self.encode_prompt( prompt=negative_prompt, - prompt_2=negative_prompt_2, prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=negative_pooled_prompt_embeds, device=device, num_images_per_prompt=num_images_per_prompt, max_sequence_length=max_sequence_length, @@ -933,7 +794,6 @@ def __call__( hidden_states=latents, timestep=timestep / 1000, guidance=guidance, - pooled_projections=pooled_prompt_embeds, encoder_hidden_states=prompt_embeds, txt_ids=text_ids, img_ids=latent_image_ids, @@ -941,7 +801,7 @@ def __call__( return_dict=False, )[0] - if do_true_cfg: + if do_cfg: if negative_image_embeds is not None: self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds neg_noise_pred = self.transformer( From c8cbb31614aa69321ee99f6fe4eadecd0e865d7c Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Tue, 10 Jun 2025 02:22:52 -0600 Subject: [PATCH 010/108] add chroma init --- src/diffusers/pipelines/chroma/__init__.py | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 src/diffusers/pipelines/chroma/__init__.py diff --git a/src/diffusers/pipelines/chroma/__init__.py b/src/diffusers/pipelines/chroma/__init__.py new file mode 100644 index 000000000000..9faa7902a15c --- /dev/null +++ b/src/diffusers/pipelines/chroma/__init__.py @@ -0,0 +1,47 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_additional_imports = {} +_import_structure = {"pipeline_output": ["ChromaPipelineOutput"]} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_chroma"] = ["ChromaPipeline"] +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from 
...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_chroma import ChromaPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) + for name, value in _additional_imports.items(): + setattr(sys.modules[__name__], name, value) From 32659236b22e7e13830726f3a4956bebf306d7db Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Tue, 10 Jun 2025 02:24:23 -0600 Subject: [PATCH 011/108] make chroma output class --- .../pipelines/chroma/pipeline_output.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/diffusers/pipelines/chroma/pipeline_output.py diff --git a/src/diffusers/pipelines/chroma/pipeline_output.py b/src/diffusers/pipelines/chroma/pipeline_output.py new file mode 100644 index 000000000000..bb0a52ceb53c --- /dev/null +++ b/src/diffusers/pipelines/chroma/pipeline_output.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image +import torch + +from ...utils import BaseOutput + + +@dataclass +class ChromaPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] From 7400278857cd1bac5af4572d45cdd0af9d0d4534 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:16:44 -0600 Subject: [PATCH 012/108] add chroma transformer to dummy tp --- src/diffusers/utils/dummy_pt_objects.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 24b3c3d7be59..200e15c7abc0 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -324,6 +324,20 @@ def from_config(cls, *args, **kwargs): def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class ChromaTransformer2DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + class CogVideoXTransformer3DModel(metaclass=DummyObject): _backends = ["torch"] From c22930d7ccdb5ff90099a4a9e2e34e0784e5410c Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:18:56 -0600 Subject: [PATCH 013/108] add chroma to init --- src/diffusers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index ce0777fdef68..f660ab0521aa 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -159,6 +159,7 @@ "AutoencoderTiny", "AutoModel", "CacheMixin", + "ChromaTransformer2DModel", "CogVideoXTransformer3DModel", "CogView3PlusTransformer2DModel", "CogView4Transformer2DModel", From 4e698b1088c5ee5588692028803cba12baf4604b Mon Sep 17 00:00:00 2001 From: Edna 
<88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:21:10 -0600 Subject: [PATCH 014/108] add chroma to init --- src/diffusers/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f660ab0521aa..2067e7d9d55c 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -354,6 +354,7 @@ "BlipDiffusionControlNetPipeline", "BlipDiffusionPipeline", "CLIPImageProjection", + "ChromaPipeline", "CogVideoXFunControlPipeline", "CogVideoXImageToVideoPipeline", "CogVideoXPipeline", @@ -769,6 +770,7 @@ AutoencoderTiny, AutoModel, CacheMixin, + ChromaTransformer2DModel, CogVideoXTransformer3DModel, CogView3PlusTransformer2DModel, CogView4Transformer2DModel, @@ -942,6 +944,7 @@ AudioLDMPipeline, AuraFlowPipeline, CLIPImageProjection, + ChromaPipeline, CogVideoXFunControlPipeline, CogVideoXImageToVideoPipeline, CogVideoXPipeline, From 5eb4b822aee0e9ebe10e96a29cb81ef641fe9502 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:38:58 -0600 Subject: [PATCH 015/108] fix single file --- src/diffusers/loaders/single_file_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 82e4db7283cc..e07370130889 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -101,7 +101,7 @@ "ChromaTransformer2DModel": { "checkpoint_mapping_fn": convert_chroma_transformer_checkpoint_to_diffusers, "default_subfolder": "transformer", - } + }, "LTXVideoTransformer3DModel": { "checkpoint_mapping_fn": convert_ltx_transformer_checkpoint_to_diffusers, "default_subfolder": "transformer", From f0c75b6b6ffd6619afbb0b0cf625806cbd677766 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:46:51 -0600 Subject: [PATCH 016/108] update --- src/diffusers/models/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 8723fbca2187..db8b5fc7eb7f 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -60,6 +60,7 @@ _import_structure["embeddings"] = ["ImageProjection"] _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"] + _import_structure["transformers.chroma_transformer_2d"] = ["ChromaTransformer2DModel"] _import_structure["transformers.cogvideox_transformer_3d"] = ["CogVideoXTransformer3DModel"] _import_structure["transformers.consisid_transformer_3d"] = ["ConsisIDTransformer3DModel"] _import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"] @@ -151,6 +152,7 @@ from .transformers import ( AllegroTransformer3DModel, AuraFlowTransformer2DModel, + ChromaTransformer2DModel, CogVideoXTransformer3DModel, CogView3PlusTransformer2DModel, CogView4Transformer2DModel, From 6441e70defff84b7855b83ad01010d369626586f Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:48:44 -0600 Subject: [PATCH 017/108] update --- src/diffusers/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index db8b5fc7eb7f..b493d651f4ba 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ 
-60,7 +60,6 @@ _import_structure["embeddings"] = ["ImageProjection"] _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["transformers.auraflow_transformer_2d"] = ["AuraFlowTransformer2DModel"] - _import_structure["transformers.chroma_transformer_2d"] = ["ChromaTransformer2DModel"] _import_structure["transformers.cogvideox_transformer_3d"] = ["CogVideoXTransformer3DModel"] _import_structure["transformers.consisid_transformer_3d"] = ["ConsisIDTransformer3DModel"] _import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"] @@ -75,6 +74,7 @@ _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"] _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"] _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"] + _import_structure["transformers.transformer_chroma"] = ["ChromaTransformer2DModel"] _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"] _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"] _import_structure["transformers.transformer_cosmos"] = ["CosmosTransformer3DModel"] From a6f231c7ce48e0200185056dcc86dca376a24ea3 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:51:45 -0600 Subject: [PATCH 018/108] add chroma to auto pipeline --- src/diffusers/pipelines/auto_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index ed8ad79ca781..29aa321f5ca3 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -21,6 +21,7 @@ from ..models.controlnets import ControlNetUnionModel from ..utils import is_sentencepiece_available from .aura_flow import AuraFlowPipeline +from .chroma import ChromaPipeline from .cogview3 import CogView3PlusPipeline from .cogview4 import CogView4ControlPipeline, CogView4Pipeline from .controlnet import ( @@ -143,6 +144,7 @@ ("flux-controlnet", FluxControlNetPipeline), ("lumina", LuminaPipeline), ("lumina2", Lumina2Pipeline), + ("chroma", ChromaPipeline), ("cogview3", CogView3PlusPipeline), ("cogview4", CogView4Pipeline), ("cogview4-control", CogView4ControlPipeline), From 7445cf422aff613bb6745920795d4b6cdf7d69d6 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:53:06 -0600 Subject: [PATCH 019/108] add chroma to pipeline init --- src/diffusers/pipelines/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 268e5c2a8c39..d20d609ff9c4 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -148,6 +148,7 @@ "AudioLDM2UNet2DConditionModel", ] _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"] + _import_structure["chroma"] = ["ChromaPipeline"] _import_structure["cogvideo"] = [ "CogVideoXPipeline", "CogVideoXImageToVideoPipeline", From af918c89dd9fe3c3355ad3a0ad43fa505d3fccfa Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:55:03 -0600 Subject: [PATCH 020/108] change to chroma transformer --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index f6d2e366e48e..7ef191a54de4
100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -28,7 +28,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, FluxTransformer2DModel +from ...models import AutoencoderKL, ChromaTransformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( USE_PEFT_BACKEND, From 2fcc75a6d89ab010789f20963c1b38b872801afd Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 18:55:56 -0600 Subject: [PATCH 021/108] take out variant from blocks --- src/diffusers/models/transformers/transformer_chroma.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py index 1f726f5cb4b0..7b46ef9c4376 100644 --- a/src/diffusers/models/transformers/transformer_chroma.py +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -290,7 +290,6 @@ def __init__( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, - variant=variant, ) for _ in range(num_layers) ] @@ -302,7 +301,6 @@ def __init__( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, - variant=variant, ) for _ in range(num_single_layers) ] From 0b027a24533890171b1536f2942bb662ca1466d4 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:04:52 -0600 Subject: [PATCH 022/108] swap embedder location --- src/diffusers/loaders/single_file_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index aace8fc7bffb..f406ba5ce7e4 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -3333,20 +3333,20 @@ def swap_scale_shift(weight): return new_weight # guidance - converted_state_dict["time_text_embed.embedder.in_proj.bias"] = checkpoint.pop( + converted_state_dict["distilled_guidance_layer.in_proj.bias"] = checkpoint.pop( "distilled_guidance_layer.in_proj.bias" ) - converted_state_dict["time_text_embed.embedder.in_proj.weight"] = checkpoint.pop( + converted_state_dict["distilled_guidance_layer.in_proj.weight"] = checkpoint.pop( "distilled_guidance_layer.in_proj.weight" ) - converted_state_dict["time_text_embed.embedder.out_proj.bias"] = checkpoint.pop( + converted_state_dict["distilled_guidance_layer.out_proj.bias"] = checkpoint.pop( "distilled_guidance_layer.out_proj.bias" ) - converted_state_dict["time_text_embed.embedder.out_proj.weight"] = checkpoint.pop( + converted_state_dict["distilled_guidance_layer.out_proj.weight"] = checkpoint.pop( "distilled_guidance_layer.out_proj.weight" ) for i in range(num_guidance_layers): - block_prefix = f"time_text_embed.embedder.layers.{i}." + block_prefix = f"distilled_guidance_layer.layers.{i}." 
converted_state_dict[f"{block_prefix}linear_1.bias"] = checkpoint.pop( f"distilled_guidance_layer.layers.{i}.in_layer.bias" ) @@ -3359,7 +3359,7 @@ def swap_scale_shift(weight): converted_state_dict[f"{block_prefix}linear_2.weight"] = checkpoint.pop( f"distilled_guidance_layer.layers.{i}.out_layer.weight" ) - converted_state_dict[f"time_text_embed.embedder.norms.{i}.weight"] = checkpoint.pop( + converted_state_dict[f"distilled_guidance_layer.norms.{i}.weight"] = checkpoint.pop( f"distilled_guidance_layer.norms.{i}.scale" ) From 6c0aed14dbaab0fc76c7d90e2ae382c3dab18fe9 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:06:45 -0600 Subject: [PATCH 023/108] remove prompt_2 --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 7ef191a54de4..2c5f7988534c 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -308,7 +308,7 @@ def encode_prompt( if prompt_embeds is None: prompt_embeds = self._get_t5_prompt_embeds( - prompt=prompt_2, + prompt=prompt, num_images_per_prompt=num_images_per_prompt, max_sequence_length=max_sequence_length, device=device, @@ -377,7 +377,6 @@ def check_inputs( height, width, negative_prompt=None, - negative_prompt_2=None, prompt_embeds=None, negative_prompt_embeds=None, callback_on_step_end_tensor_inputs=None, From f190c02af71b9dfbfa64bff2921d47b5b76220a0 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:09:37 -0600 Subject: [PATCH 024/108] work on swapping text encoders --- .../pipelines/chroma/pipeline_chroma.py | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 2c5f7988534c..88b435fb2917 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -19,8 +19,6 @@ import torch from transformers import ( CLIPImageProcessor, - CLIPTextModel, - CLIPTokenizer, CLIPVisionModelWithProjection, T5EncoderModel, T5TokenizerFast, @@ -168,7 +166,7 @@ class ChromaPipeline( [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast). """ - model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae" + model_cpu_offload_seq = "text_encoder->image_encoder->transformer->vae" _optional_components = ["image_encoder", "feature_extractor"] _callback_tensor_inputs = ["latents", "prompt_embeds"] @@ -198,9 +196,6 @@ def __init__( # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible # by the patch size. 
So the vae scale factor is multiplied by the patch size to account for this self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) - self.tokenizer_max_length = ( - self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 - ) self.default_sample_size = 128 def _get_chroma_attn_mask(self, length: torch.Tensor, max_sequence_length: int) -> torch.Tensor: @@ -225,9 +220,9 @@ def _get_t5_prompt_embeds( batch_size = len(prompt) if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2) + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - text_inputs = self.tokenizer_2( + text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_sequence_length, @@ -237,16 +232,9 @@ def _get_t5_prompt_embeds( return_tensors="pt", ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because `max_sequence_length` is set to " - f" {max_sequence_length} tokens: {removed_text}" - ) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - prompt_embeds = self.text_encoder_2( + prompt_embeds = self.text_encoder( text_input_ids.to(device), output_hidden_states=False, attention_mask=( @@ -254,7 +242,7 @@ def _get_t5_prompt_embeds( ), )[0] - dtype = self.text_encoder_2.dtype + dtype = self.text_encoder.dtype prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) _, seq_len, _ = prompt_embeds.shape From 38429ffcaccb49632c4f32804ab75082e78c2bc3 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:11:47 -0600 Subject: [PATCH 025/108] remove mask function --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 88b435fb2917..09883f54c7b1 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -198,13 +198,6 @@ def __init__( self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) self.default_sample_size = 128 - def _get_chroma_attn_mask(self, length: torch.Tensor, max_sequence_length: int) -> torch.Tensor: - attention_mask = torch.zeros((length.shape[0], max_sequence_length), dtype=torch.bool, device=length.device) - for i, n_tokens in enumerate(length): - n_tokens = torch.max(n_tokens + 1, max_sequence_length) - attention_mask[i, :n_tokens] = True - return attention_mask - def _get_t5_prompt_embeds( self, prompt: Union[str, List[str]] = None, @@ -234,12 +227,12 @@ def _get_t5_prompt_embeds( text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 + prompt_embeds = self.text_encoder( text_input_ids.to(device), output_hidden_states=False, - attention_mask=( - self._get_chroma_attn_mask(text_inputs.length, max_sequence_length).to(device) - ), + attention_mask=text_inputs.attention_mask.to(device), )[0] 
dtype = self.text_encoder.dtype From 7c75d8e98d88816f2a2d76d542b2814ec446f0dc Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:15:18 -0600 Subject: [PATCH 026/108] dont modify mask (for now) --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 09883f54c7b1..1ddce5fb717b 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -227,7 +227,7 @@ def _get_t5_prompt_embeds( text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 + #text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 prompt_embeds = self.text_encoder( text_input_ids.to(device), From c9b46af65f4cd51bf5c32cb2795bd5069b1a61a6 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:16:24 -0600 Subject: [PATCH 027/108] wrap attn mask --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 1ddce5fb717b..62f601c0dc9c 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -227,12 +227,12 @@ def _get_t5_prompt_embeds( text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - #text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 + text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 prompt_embeds = self.text_encoder( text_input_ids.to(device), output_hidden_states=False, - attention_mask=text_inputs.attention_mask.to(device), + attention_mask=(text_inputs.attention_mask.to(device),), )[0] dtype = self.text_encoder.dtype From 146255aba134360d4d11357d2711a205402528b1 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:17:29 -0600 Subject: [PATCH 028/108] no attn mask (can't get it to work) --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 62f601c0dc9c..04c05372c488 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -227,12 +227,12 @@ def _get_t5_prompt_embeds( text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 + #text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 prompt_embeds = self.text_encoder( text_input_ids.to(device), output_hidden_states=False, - attention_mask=(text_inputs.attention_mask.to(device),), + #attention_mask=(text_inputs.attention_mask.to(device),), )[0] dtype = self.text_encoder.dtype From 3309ffef1ce43d4c74ff1beba7da97c1fd4c0a1b Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:33:17 -0600 Subject: [PATCH 029/108] remove pooled prompt embeds --- 
src/diffusers/pipelines/chroma/pipeline_chroma.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 04c05372c488..32135d2c21fe 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -788,7 +788,6 @@ def __call__( hidden_states=latents, timestep=timestep / 1000, guidance=guidance, - pooled_projections=negative_pooled_prompt_embeds, encoder_hidden_states=negative_prompt_embeds, txt_ids=negative_text_ids, img_ids=latent_image_ids, From 77b429eda416f0f6645b591b370971913f6bdbf5 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:35:10 -0600 Subject: [PATCH 030/108] change to my own unpooled embeddeer --- src/diffusers/models/embeddings.py | 32 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 8aa2ea5841e9..0ba64eadf2c1 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1636,36 +1636,46 @@ def forward(self, timestep, guidance, pooled_projection): return conditioning - class CombinedTimestepTextProjChromaEmbeddings(nn.Module): def __init__(self, factor: int, hidden_dim: int, out_dim: int, n_layers: int, embedding_dim: int): super().__init__() self.time_proj = Timesteps(num_channels=factor, flip_sin_to_cos=True, downscale_freq_shift=0) self.guidance_proj = Timesteps(num_channels=factor, flip_sin_to_cos=True, downscale_freq_shift=0) + self.embedder = ChromaApproximator( + in_dim=factor * 4, + out_dim=out_dim, + hidden_dim=hidden_dim, + n_layers=n_layers, + ) + self.embedding_dim = embedding_dim self.register_buffer( "mod_proj", - get_timestep_embedding(torch.arange(out_dim)*1000, 2 * factor, flip_sin_to_cos=True, downscale_freq_shift=0, ), + get_timestep_embedding(torch.arange(344), 2 * factor, flip_sin_to_cos=True, downscale_freq_shift=0), persistent=False, ) def forward( - self, timestep: torch.Tensor, guidance: Optional[torch.Tensor], pooled_projections: torch.Tensor + self, timestep: torch.Tensor, guidance: Optional[torch.Tensor] ) -> torch.Tensor: mod_index_length = self.mod_proj.shape[0] - timesteps_proj = self.time_proj(timestep).to(dtype=timestep.dtype) - guidance_proj = self.guidance_proj(torch.tensor([0])).to(dtype=timestep.dtype, device=timestep.device) - - mod_proj = self.mod_proj.to(dtype=timesteps_proj.dtype, device=timesteps_proj.device) + timesteps_proj = self.time_proj(timestep) + if guidance is not None: + guidance_proj = self.guidance_proj(guidance.repeat(timesteps_proj.shape[0])) + else: + guidance_proj = torch.zeros( + (1, self.guidance_proj.num_channels), + dtype=timesteps_proj.dtype, + device=timesteps_proj.device, + ) + mod_proj = self.mod_proj.unsqueeze(0).repeat(timesteps_proj.shape[0], 1, 1).to(dtype=timesteps_proj.dtype, device=timesteps_proj.device) timestep_guidance = ( - torch.cat([timesteps_proj, guidance_proj], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1) + torch.cat([timesteps_proj, guidance_proj], dim=1).repeat(1, mod_index_length, 1) ) - input_vec = torch.cat([timestep_guidance, mod_proj.unsqueeze(0)], dim=-1) - + input_vec = torch.cat([timestep_guidance, mod_proj], dim=-1) return input_vec - class CogView3CombinedTimestepSizeEmbeddings(nn.Module): def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): 
super().__init__() From df7fde7a6d32b03a8ad77d337e6a2125edf4e9c8 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:36:34 -0600 Subject: [PATCH 031/108] fix load --- src/diffusers/models/embeddings.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 0ba64eadf2c1..8a89a5d1366a 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1642,17 +1642,10 @@ def __init__(self, factor: int, hidden_dim: int, out_dim: int, n_layers: int, em self.time_proj = Timesteps(num_channels=factor, flip_sin_to_cos=True, downscale_freq_shift=0) self.guidance_proj = Timesteps(num_channels=factor, flip_sin_to_cos=True, downscale_freq_shift=0) - self.embedder = ChromaApproximator( - in_dim=factor * 4, - out_dim=out_dim, - hidden_dim=hidden_dim, - n_layers=n_layers, - ) - self.embedding_dim = embedding_dim self.register_buffer( "mod_proj", - get_timestep_embedding(torch.arange(344), 2 * factor, flip_sin_to_cos=True, downscale_freq_shift=0), + get_timestep_embedding(torch.arange(out_dim)*1000, 2 * factor, flip_sin_to_cos=True, downscale_freq_shift=0), persistent=False, ) From 68f771bf43cc4732ddbb714341242f2ac37ce983 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:38:38 -0600 Subject: [PATCH 032/108] take pooled projections out of transformer --- src/diffusers/models/transformers/transformer_chroma.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py index 7b46ef9c4376..72cde1f60b67 100644 --- a/src/diffusers/models/transformers/transformer_chroma.py +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -236,8 +236,6 @@ class ChromaTransformer2DModel( joint_attention_dim (`int`, defaults to `4096`): The number of dimensions to use for the joint attention (embedding/channel dimension of `encoder_hidden_states`). - pooled_projection_dim (`int`, defaults to `768`): - The number of dimensions to use for the pooled projection. guidance_embeds (`bool`, defaults to `False`): Whether to use guidance embeddings for guidance-distilled variant of the model. axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`): @@ -259,7 +257,6 @@ def __init__( attention_head_dim: int = 128, num_attention_heads: int = 24, joint_attention_dim: int = 4096, - pooled_projection_dim: int = 768, guidance_embeds: bool = False, axes_dims_rope: Tuple[int, ...] = (16, 56, 56), approximator_in_factor: int = 16, @@ -416,7 +413,6 @@ def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor = None, - pooled_projections: torch.Tensor = None, timestep: torch.LongTensor = None, img_ids: torch.Tensor = None, txt_ids: torch.Tensor = None, @@ -435,8 +431,6 @@ def forward( Input `hidden_states`. encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`): Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. - pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected - from the embeddings of input conditions. timestep ( `torch.LongTensor`): Used to indicate denoising step. 
block_controlnet_hidden_states: (`list` of `torch.Tensor`): @@ -474,7 +468,7 @@ def forward( if guidance is not None: guidance = guidance.to(hidden_states.dtype) * 1000 - input_vec = self.time_text_embed(timestep, guidance, pooled_projections) + input_vec = self.time_text_embed(timestep, guidance) pooled_temb = self.distilled_guidance_layer(input_vec) encoder_hidden_states = self.context_embedder(encoder_hidden_states) From f783f38883f6f9c04c6ccb0a5bb630cc76c07e98 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:52:43 -0600 Subject: [PATCH 033/108] ensure correct dtype for chroma embeddings --- src/diffusers/models/embeddings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index a31999267506..dc39480b6506 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1665,6 +1665,7 @@ def forward( torch.cat([timesteps_proj, guidance_proj], dim=1).repeat(1, mod_index_length, 1) ) input_vec = torch.cat([timestep_guidance, mod_proj], dim=-1) + input_vec.to(dtype=timestep.dtype) return input_vec class CogView3CombinedTimestepSizeEmbeddings(nn.Module): From f6de1afc3febd680b41ba4b16d643cb3b897c091 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:54:27 -0600 Subject: [PATCH 034/108] update --- src/diffusers/models/embeddings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index dc39480b6506..8d3f7cbbe378 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1665,8 +1665,7 @@ def forward( torch.cat([timesteps_proj, guidance_proj], dim=1).repeat(1, mod_index_length, 1) ) input_vec = torch.cat([timestep_guidance, mod_proj], dim=-1) - input_vec.to(dtype=timestep.dtype) - return input_vec + return input_vec.to(dtype=timestep.dtype) class CogView3CombinedTimestepSizeEmbeddings(nn.Module): def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): From ab7942174ad9debd5f3a41b1df54c1868e863e75 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:57:31 -0600 Subject: [PATCH 035/108] use dn6 attn mask + fix true_cfg_scale --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 32135d2c21fe..de7e5deb201e 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -232,9 +232,14 @@ def _get_t5_prompt_embeds( prompt_embeds = self.text_encoder( text_input_ids.to(device), output_hidden_states=False, - #attention_mask=(text_inputs.attention_mask.to(device),), + attention_mask=text_inputs.attention_mask.to(device), )[0] + max_len = min(text_inputs.attention_mask.sum() + 1, max_sequence_length) + prompt_embeds = prompt_embeds[ + :, :max_len + ] + dtype = self.text_encoder.dtype prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) @@ -554,7 +559,7 @@ def __call__( instead. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is not greater than `1`). height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. @@ -794,7 +799,7 @@ def __call__( joint_attention_kwargs=self.joint_attention_kwargs, return_dict=False, )[0] - noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + noise_pred = neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred) # compute the previous noisy sample x_t -> x_t-1 latents_dtype = latents.dtype From 442f77a2d7fc12f67310763b8e157d5751617205 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:59:43 -0600 Subject: [PATCH 036/108] use chroma pipeline output --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index de7e5deb201e..7a2fc90841b2 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -609,7 +609,7 @@ def __call__( The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.flux.ChromaPipelineOutput`] instead of a plain tuple. joint_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in @@ -842,4 +842,4 @@ def __call__( if not return_dict: return (image,) - return FluxPipelineOutput(images=image) + return ChromaPipelineOutput(images=image) From e69d73099d0572748f0f078d7c97f94ff5fb5a6c Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:05:28 -0600 Subject: [PATCH 037/108] use DN6 embeddings --- src/diffusers/models/embeddings.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 8d3f7cbbe378..adb00b247560 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1651,20 +1651,15 @@ def forward( self, timestep: torch.Tensor, guidance: Optional[torch.Tensor] ) -> torch.Tensor: mod_index_length = self.mod_proj.shape[0] - timesteps_proj = self.time_proj(timestep) - if guidance is not None: - guidance_proj = self.guidance_proj(guidance.repeat(timesteps_proj.shape[0])) - else: - guidance_proj = torch.zeros( - (1, self.guidance_proj.num_channels), - dtype=timesteps_proj.dtype, - device=timesteps_proj.device, - ) - mod_proj = self.mod_proj.unsqueeze(0).repeat(timesteps_proj.shape[0], 1, 1).to(dtype=timesteps_proj.dtype, device=timesteps_proj.device) + + timesteps_proj = self.time_proj(timestep).to(dtype=timestep.dtype) + guidance_proj = self.guidance_proj(torch.tensor([0])).to(dtype=timestep.dtype, device=timestep.device) + + mod_proj = self.mod_proj.to(dtype=timesteps_proj.dtype, device=timesteps_proj.device) timestep_guidance = ( - torch.cat([timesteps_proj, guidance_proj], dim=1).repeat(1, 
mod_index_length, 1) + torch.cat([timesteps_proj, guidance_proj], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1) ) - input_vec = torch.cat([timestep_guidance, mod_proj], dim=-1) + input_vec = torch.cat([timestep_guidance, mod_proj.unsqueeze(0)], dim=-1) return input_vec.to(dtype=timestep.dtype) class CogView3CombinedTimestepSizeEmbeddings(nn.Module): From 01bc0dcc56b93d3df77a220920a2df037df15701 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:45:45 -0600 Subject: [PATCH 038/108] remove guidance --- src/diffusers/models/transformers/transformer_chroma.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py index 72cde1f60b67..fd5b01d1ee53 100644 --- a/src/diffusers/models/transformers/transformer_chroma.py +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -416,7 +416,6 @@ def forward( timestep: torch.LongTensor = None, img_ids: torch.Tensor = None, txt_ids: torch.Tensor = None, - guidance: torch.Tensor = None, joint_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_block_samples=None, controlnet_single_block_samples=None, @@ -465,10 +464,8 @@ def forward( hidden_states = self.x_embedder(hidden_states) timestep = timestep.to(hidden_states.dtype) * 1000 - if guidance is not None: - guidance = guidance.to(hidden_states.dtype) * 1000 - input_vec = self.time_text_embed(timestep, guidance) + input_vec = self.time_text_embed(timestep) pooled_temb = self.distilled_guidance_layer(input_vec) encoder_hidden_states = self.context_embedder(encoder_hidden_states) From e31c94866d9c56433184f1ef906218b220f12b10 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:46:59 -0600 Subject: [PATCH 039/108] remove guidance embed (pipeline) --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index 7a2fc90841b2..e2081405c05e 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -724,13 +724,6 @@ def __call__( num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) - # handle guidance - if self.transformer.config.guidance_embeds: - guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32) - guidance = guidance.expand(latents.shape[0]) - else: - guidance = None - if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and ( negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None ): @@ -778,7 +771,6 @@ def __call__( noise_pred = self.transformer( hidden_states=latents, timestep=timestep / 1000, - guidance=guidance, encoder_hidden_states=prompt_embeds, txt_ids=text_ids, img_ids=latent_image_ids, @@ -792,7 +784,6 @@ def __call__( neg_noise_pred = self.transformer( hidden_states=latents, timestep=timestep / 1000, - guidance=guidance, encoder_hidden_states=negative_prompt_embeds, txt_ids=negative_text_ids, img_ids=latent_image_ids, From 406ab3b1e9696fbcd723658b45a5e2010109ddd5 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:47:59 -0600 Subject: [PATCH 040/108] remove guidance from embeddings --- src/diffusers/models/embeddings.py | 
4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index adb00b247560..01a8f316be1e 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1647,9 +1647,7 @@ def __init__(self, factor: int, hidden_dim: int, out_dim: int, n_layers: int, em persistent=False, ) - def forward( - self, timestep: torch.Tensor, guidance: Optional[torch.Tensor] - ) -> torch.Tensor: + def forward(self, timestep: torch.Tensor) -> torch.Tensor: mod_index_length = self.mod_proj.shape[0] timesteps_proj = self.time_proj(timestep).to(dtype=timestep.dtype) From 1bd8fdfcb6e43622a04e9477afd7cd7cfae4e441 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:56:27 -0600 Subject: [PATCH 041/108] don't return length --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index e2081405c05e..e376a402e52b 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -220,14 +220,12 @@ def _get_t5_prompt_embeds( padding="max_length", max_length=max_sequence_length, truncation=True, - return_length=True, + return_length=False, return_overflowing_tokens=False, return_tensors="pt", ) text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - #text_inputs.attention_mask[:, : text_inputs.length + 1] = 1.0 prompt_embeds = self.text_encoder( text_input_ids.to(device), From 3e2452ded0ce07306dae684b8b74549bd30ca6dd Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 21:23:35 -0600 Subject: [PATCH 042/108] dont change dtype --- src/diffusers/models/embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 01a8f316be1e..0708f93299ab 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1658,7 +1658,7 @@ def forward(self, timestep: torch.Tensor) -> torch.Tensor: torch.cat([timesteps_proj, guidance_proj], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1) ) input_vec = torch.cat([timestep_guidance, mod_proj.unsqueeze(0)], dim=-1) - return input_vec.to(dtype=timestep.dtype) + return input_vec class CogView3CombinedTimestepSizeEmbeddings(nn.Module): def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): From 1efa772f696c1e2d7026110c17e25306224726b0 Mon Sep 17 00:00:00 2001 From: BuildTools Date: Wed, 11 Jun 2025 21:46:40 -0600 Subject: [PATCH 043/108] remove unused stuff, fix up docs --- src/diffusers/models/transformers/transformer_chroma.py | 5 ----- src/diffusers/pipelines/chroma/pipeline_chroma.py | 4 ---- 2 files changed, 9 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_chroma.py b/src/diffusers/models/transformers/transformer_chroma.py index fd5b01d1ee53..65ff7ac14763 100644 --- a/src/diffusers/models/transformers/transformer_chroma.py +++ b/src/diffusers/models/transformers/transformer_chroma.py @@ -34,9 +34,7 @@ ) from ..cache_utils import CacheMixin from ..embeddings import ( - CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjChromaEmbeddings, - 
CombinedTimestepTextProjEmbeddings, ChromaApproximator, FluxPosEmbed, ) @@ -236,8 +234,6 @@ class ChromaTransformer2DModel( joint_attention_dim (`int`, defaults to `4096`): The number of dimensions to use for the joint attention (embedding/channel dimension of `encoder_hidden_states`). - guidance_embeds (`bool`, defaults to `False`): - Whether to use guidance embeddings for guidance-distilled variant of the model. axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`): The dimensions to use for the rotary positional embeddings. """ @@ -257,7 +253,6 @@ def __init__( attention_head_dim: int = 128, num_attention_heads: int = 24, joint_attention_dim: int = 4096, - guidance_embeds: bool = False, axes_dims_rope: Tuple[int, ...] = (16, 56, 56), approximator_in_factor: int = 16, approximator_hidden_dim: int = 5120, diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index e376a402e52b..d0aabed2a9e1 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -225,7 +225,6 @@ def _get_t5_prompt_embeds( return_tensors="pt", ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids prompt_embeds = self.text_encoder( text_input_ids.to(device), @@ -270,9 +269,6 @@ def encode_prompt( prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" From 619921ca22602577b09c69279b939ace00551264 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 21:53:27 -0600 Subject: [PATCH 044/108] add chroma autodoc --- docs/source/en/api/models/chroma_transformer | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 docs/source/en/api/models/chroma_transformer diff --git a/docs/source/en/api/models/chroma_transformer b/docs/source/en/api/models/chroma_transformer new file mode 100644 index 000000000000..f8ee50165c64 --- /dev/null +++ b/docs/source/en/api/models/chroma_transformer @@ -0,0 +1,19 @@ + + +# ChromaTransformer2DModel + +A modified flux Transformer model from [Chroma](https://huggingface.co/lodestones/Chroma) + +## ChromaTransformer2DModel + +[[autodoc]] ChromaTransformer2DModel From f821f2ad5ef544955271ee406d8b0ca8bf9d169e Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 21:54:43 -0600 Subject: [PATCH 045/108] add .md (oops) --- .../en/api/models/{chroma_transformer => chroma_transformer.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/en/api/models/{chroma_transformer => chroma_transformer.md} (100%) diff --git a/docs/source/en/api/models/chroma_transformer b/docs/source/en/api/models/chroma_transformer.md similarity index 100% rename from docs/source/en/api/models/chroma_transformer rename to docs/source/en/api/models/chroma_transformer.md From b0cf6803a74a5f96efd3c83430c40263df0a5f3a Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Wed, 11 Jun 2025 22:07:21 -0600 Subject: [PATCH 046/108] initial chroma docs --- docs/source/en/api/pipelines/chroma.md | 90 ++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 docs/source/en/api/pipelines/chroma.md diff --git a/docs/source/en/api/pipelines/chroma.md b/docs/source/en/api/pipelines/chroma.md new file mode 100644 index 000000000000..d11bcfabdc99 --- /dev/null +++ b/docs/source/en/api/pipelines/chroma.md @@ -0,0 +1,90 @@ + + +# Chroma + +
+ LoRA + MPS +
+ + +Chroma is a text-to-image generation model based on Flux. + +Original model checkpoints for Chroma can be found [here](https://huggingface.co/lodestones/Chroma). + + + +Chroma can use all the same optimizations as Flux. + + +### Inference + +```python +import torch +from diffusers import ChromaPipeline + +pipe = ChromaPipeline.from_pretrained("chroma-diffusers-repo", torch_dtype=torch.bfloat16) +pipe.enable_model_cpu_offload() + +prompt = "A cat holding a sign that says hello world" +out = pipe( + prompt=prompt, + guidance_scale=4.0, + height=1024, + width=1024, + num_inference_steps=26, +).images[0] +out.save("image.png") +``` + +## Single File Loading for the `ChromaTransformer2DModel` + +The `ChromaTransformer2DModel` supports loading checkpoints in the original format. This is also useful when trying to load finetunes or quantized versions of the models that have been published by the community. + +The following example demonstrates how to run Chroma from a single file. + +The T5 text encoder and tokenizer are reused from the FLUX.1-dev repository. + +```python +import torch +from diffusers import ChromaTransformer2DModel, ChromaPipeline +from transformers import T5EncoderModel, T5Tokenizer + +bfl_repo = "black-forest-labs/FLUX.1-dev" +dtype = torch.bfloat16 + +transformer = ChromaTransformer2DModel.from_single_file("https://huggingface.co/lodestones/Chroma/blob/main/chroma-unlocked-v35.safetensors", torch_dtype=dtype) + +text_encoder = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype) +tokenizer = T5Tokenizer.from_pretrained(bfl_repo, subfolder="tokenizer_2", torch_dtype=dtype) + +pipe = ChromaPipeline.from_pretrained(bfl_repo, transformer=transformer, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=dtype) + +pipe.enable_model_cpu_offload() + +prompt = "A cat holding a sign that says hello world" +image = pipe( + prompt, + guidance_scale=4.0, + output_type="pil", + num_inference_steps=26, + generator=torch.Generator("cpu").manual_seed(0) +).images[0] + +image.save("image.png") +``` + +## ChromaPipeline + +[[autodoc]] ChromaPipeline + - all + - __call__ From 0c5eb4470164b30118644d6dbffb427b7fde2c33 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:46:41 -0600 Subject: [PATCH 047/108] undo don't change dtype --- src/diffusers/models/embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 0708f93299ab..641944d67f0d 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1658,7 +1658,7 @@ def forward(self, timestep: torch.Tensor) -> torch.Tensor: torch.cat([timesteps_proj, guidance_proj], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1) ) input_vec = torch.cat([timestep_guidance, mod_proj.unsqueeze(0)], dim=-1) - return input_vec + return input_vec.to(timestep.dtype) class CogView3CombinedTimestepSizeEmbeddings(nn.Module): def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): From 42c0e8ecbebd3717b5cd7978fd2eb1ba30e84561 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:50:36 -0600 Subject: [PATCH 048/108] undo arxiv change unsure why that happened --- src/diffusers/models/embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 641944d67f0d..1a43994c1116 100644 ---
a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -1399,7 +1399,7 @@ class ImagePositionalEmbeddings(nn.Module): Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the height and width of the latent space. - For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092 + For more details, see figure 10 of the dall-e paper: https://huggingface.co/papers/2102.12092 For VQ-diffusion: From da846d1fff09c4d4e1a1125e5d5b10d655b07469 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:53:40 -0600 Subject: [PATCH 049/108] fix hf papers regression in more places --- src/diffusers/models/normalization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py index f2b71bb6888e..b07ed2ca893c 100644 --- a/src/diffusers/models/normalization.py +++ b/src/diffusers/models/normalization.py @@ -306,7 +306,7 @@ class AdaLayerNormSingle(nn.Module): r""" Norm layer adaptive layer norm single (adaLN-single). - As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3). + As proposed in PixArt-Alpha (see: https://huggingface.co/papers/2310.00426; Section 2.3). Parameters: embedding_dim (`int`): The size of each embedding vector. @@ -623,7 +623,7 @@ def forward(self, input): class RMSNorm(nn.Module): r""" - RMS Norm as introduced in https://arxiv.org/abs/1910.07467 by Zhang et al. + RMS Norm as introduced in https://huggingface.co/papers/1910.07467 by Zhang et al. Args: dim (`int`): Number of dimensions to use for `weights`. Only effective when `elementwise_affine` is True. @@ -713,7 +713,7 @@ def forward(self, hidden_states): class GlobalResponseNorm(nn.Module): r""" - Global response normalization as introduced in ConvNeXt-v2 (https://arxiv.org/abs/2301.00808). + Global response normalization as introduced in ConvNeXt-v2 (https://huggingface.co/papers/2301.00808). Args: dim (`int`): Number of dimensions to use for the `gamma` and `beta`. From 18327cb57cad4e1e0916fc2c7e50bf41bd7e5ea5 Mon Sep 17 00:00:00 2001 From: Edna <88869424+Ednaordinary@users.noreply.github.com> Date: Thu, 12 Jun 2025 02:52:39 -0600 Subject: [PATCH 050/108] Update docs/source/en/api/pipelines/chroma.md Co-authored-by: Dhruv Nair --- docs/source/en/api/pipelines/chroma.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/chroma.md b/docs/source/en/api/pipelines/chroma.md index d11bcfabdc99..b4d718244fc7 100644 --- a/docs/source/en/api/pipelines/chroma.md +++ b/docs/source/en/api/pipelines/chroma.md @@ -1,4 +1,4 @@ -