diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml
index 1738efd63bb7..8b7e57e91297 100644
--- a/.github/workflows/nightly_tests.yml
+++ b/.github/workflows/nightly_tests.yml
@@ -73,6 +73,8 @@ jobs:
         run: |
           uv pip install -e ".[quality]"
           uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
           uv pip install pytest-reportlog
       - name: Environment
         run: |
@@ -84,7 +86,7 @@ jobs:
           CUBLAS_WORKSPACE_CONFIG: :16:8
         run: |
           pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx" \
+             -k "not Flax and not Onnx" \
             --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
             --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
             tests/pipelines/${{ matrix.module }}
@@ -126,6 +128,8 @@ jobs:
         uv pip install -e ".[quality]"
         uv pip install peft@git+https://github.com/huggingface/peft.git
         uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
         uv pip install pytest-reportlog
     - name: Environment
       run: python utils/print_env.py
@@ -138,7 +142,7 @@ jobs:
         CUBLAS_WORKSPACE_CONFIG: :16:8
       run: |
         pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -k "not Flax and not Onnx" \
           --make-reports=tests_torch_${{ matrix.module }}_cuda \
           --report-log=tests_torch_${{ matrix.module }}_cuda.log \
           tests/${{ matrix.module }}
@@ -151,7 +155,7 @@ jobs:
         CUBLAS_WORKSPACE_CONFIG: :16:8
       run: |
         pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v --make-reports=examples_torch_cuda \
+          --make-reports=examples_torch_cuda \
           --report-log=examples_torch_cuda.log \
           examples/
 
@@ -190,6 +194,8 @@ jobs:
     - name: Install dependencies
       run: |
         uv pip install -e ".[quality,training]"
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
     - name: Environment
       run: |
         python utils/print_env.py
@@ -198,7 +204,7 @@ jobs:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
         RUN_COMPILE: yes
       run: |
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
     - name: Failure short reports
       if: ${{ failure() }}
       run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -232,6 +238,8 @@ jobs:
           uv pip install -e ".[quality]"
           uv pip install peft@git+https://github.com/huggingface/peft.git
           uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
           uv pip install pytest-reportlog
       - name: Environment
         run: |
@@ -281,6 +289,8 @@ jobs:
           uv pip install -e ".[quality]"
           uv pip install peft@git+https://github.com/huggingface/peft.git
           uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
 
       - name: Environment
         run: |
@@ -293,7 +303,7 @@ jobs:
           CUBLAS_WORKSPACE_CONFIG: :16:8
         run: |
           pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx" \
+            -k "not Flax and not Onnx" \
             --make-reports=tests_torch_minimum_version_cuda \
             tests/models/test_modeling_common.py \
             tests/pipelines/test_pipelines_common.py \
@@ -358,6 +368,8 @@ jobs:
               uv pip install ${{ join(matrix.config.additional_deps, ' ') }}
           fi
           uv pip install pytest-reportlog
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
       - name: Environment
         run: |
           python utils/print_env.py
@@ -405,6 +417,8 @@ jobs:
         run: |
           uv pip install -e ".[quality]"
           uv pip install -U bitsandbytes optimum_quanto
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1 
           uv pip install pytest-reportlog
       - name: Environment
         run: |
@@ -531,7 +545,7 @@ jobs:
 #          HF_HOME: /System/Volumes/Data/mnt/cache
 #          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
 #        run: |
-#          ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
+#          ${CONDA_RUN} pytest -n 1  --make-reports=tests_torch_mps \
 #            --report-log=tests_torch_mps.log \
 #            tests/
 #      - name: Failure short reports
@@ -587,7 +601,7 @@ jobs:
 #          HF_HOME: /System/Volumes/Data/mnt/cache
 #          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
 #        run: |
-#          ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
+#          ${CONDA_RUN} pytest -n 1  --make-reports=tests_torch_mps \
 #            --report-log=tests_torch_mps.log \
 #            tests/
 #      - name: Failure short reports
diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml
index 7081ee518d55..13c228621f5c 100644
--- a/.github/workflows/pr_modular_tests.yml
+++ b/.github/workflows/pr_modular_tests.yml
@@ -109,7 +109,8 @@ jobs:
     - name: Install dependencies
       run: |
         uv pip install -e ".[quality]"
-        uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
         uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
 
     - name: Environment
@@ -120,7 +121,7 @@ jobs:
       if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
       run: |
         pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -k "not Flax and not Onnx" \
           --make-reports=tests_${{ matrix.config.report }} \
           tests/modular_pipelines
 
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 3306ebe43ef7..674e62ff443a 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -115,7 +115,8 @@ jobs:
     - name: Install dependencies
       run: |
         uv pip install -e ".[quality]"
-        uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
         uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
 
     - name: Environment
@@ -126,7 +127,7 @@ jobs:
       if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
       run: |
         pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -k "not Flax and not Onnx" \
           --make-reports=tests_${{ matrix.config.report }} \
           tests/pipelines
 
@@ -134,7 +135,7 @@ jobs:
       if: ${{ matrix.config.framework == 'pytorch_models' }}
       run: |
         pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx and not Dependency" \
+          -k "not Flax and not Onnx and not Dependency" \
           --make-reports=tests_${{ matrix.config.report }} \
           tests/models tests/schedulers tests/others
 
@@ -246,7 +247,8 @@ jobs:
         uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
         uv pip install -U tokenizers
         uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
-        uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
 
     - name: Environment
       run: |
@@ -255,11 +257,11 @@ jobs:
     - name: Run fast PyTorch LoRA tests with PEFT
       run: |
         pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-          -s -v \
+          \
           --make-reports=tests_peft_main \
           tests/lora/
         pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-          -s -v \
+          \
           --make-reports=tests_models_lora_peft_main \
           tests/models/ -k "lora"
 
diff --git a/.github/workflows/pr_tests_gpu.yml b/.github/workflows/pr_tests_gpu.yml
index 6c208ad7cac7..468979d379c1 100644
--- a/.github/workflows/pr_tests_gpu.yml
+++ b/.github/workflows/pr_tests_gpu.yml
@@ -1,4 +1,4 @@
-name: Fast GPU Tests on PR 
+name: Fast GPU Tests on PR
 
 on:
   pull_request:
@@ -71,7 +71,7 @@ jobs:
         if: ${{ failure() }}
         run: |
           echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
-  
+
   setup_torch_cuda_pipeline_matrix:
     needs: [check_code_quality, check_repository_consistency]
     name: Setup Torch Pipelines CUDA Slow Tests Matrix
@@ -131,7 +131,8 @@ jobs:
         run: |
           uv pip install -e ".[quality]"
           uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
 
       - name: Environment
         run: |
@@ -149,18 +150,18 @@ jobs:
           # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
           CUBLAS_WORKSPACE_CONFIG: :16:8
         run: |
-          if [ "${{ matrix.module }}" = "ip_adapters" ]; then 
+          if [ "${{ matrix.module }}" = "ip_adapters" ]; then
               pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-              -s -v -k "not Flax and not Onnx" \
+              -k "not Flax and not Onnx" \
               --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
               tests/pipelines/${{ matrix.module }}
-          else 
+          else
               pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
               pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-              -s -v -k "not Flax and not Onnx and $pattern" \
+              -k "not Flax and not Onnx and $pattern" \
               --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
               tests/pipelines/${{ matrix.module }}
-          fi 
+          fi
 
       - name: Failure short reports
         if: ${{ failure() }}
@@ -201,7 +202,8 @@ jobs:
         uv pip install -e ".[quality]"
         uv pip install peft@git+https://github.com/huggingface/peft.git
         uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-        uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
 
     - name: Environment
       run: |
@@ -222,11 +224,11 @@ jobs:
       run: |
         pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
         if [ -z "$pattern" ]; then
-          pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
-          --make-reports=tests_torch_cuda_${{ matrix.module }}  
+          pytest -n 1  --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
+          --make-reports=tests_torch_cuda_${{ matrix.module }}
         else
-          pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
-          --make-reports=tests_torch_cuda_${{ matrix.module }}  
+          pytest -n 1  --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
+          --make-reports=tests_torch_cuda_${{ matrix.module }}
         fi
 
     - name: Failure short reports
@@ -262,7 +264,8 @@ jobs:
         nvidia-smi
     - name: Install dependencies
       run: |
-        uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
         uv pip install -e ".[quality,training]"
 
     - name: Environment
@@ -274,7 +277,7 @@ jobs:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
       run: |
         uv pip install ".[training]"
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
 
     - name: Failure short reports
       if: ${{ failure() }}
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index a10dfcaf4e4d..7b1c441d3dc0 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -76,6 +76,8 @@ jobs:
         run: |
           uv pip install -e ".[quality]"
           uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
       - name: Environment
         run: |
           python utils/print_env.py
@@ -86,7 +88,7 @@ jobs:
           CUBLAS_WORKSPACE_CONFIG: :16:8
         run: |
           pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx" \
+            -k "not Flax and not Onnx" \
             --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
             tests/pipelines/${{ matrix.module }}
       - name: Failure short reports
@@ -127,6 +129,8 @@ jobs:
         uv pip install -e ".[quality]"
         uv pip install peft@git+https://github.com/huggingface/peft.git
         uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
 
     - name: Environment
       run: |
@@ -139,7 +143,7 @@ jobs:
         CUBLAS_WORKSPACE_CONFIG: :16:8
       run: |
         pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -k "not Flax and not Onnx" \
           --make-reports=tests_torch_cuda_${{ matrix.module }} \
           tests/${{ matrix.module }}
 
@@ -178,6 +182,8 @@ jobs:
     - name: Install dependencies
       run: |
         uv pip install -e ".[quality,training]"
+        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
+        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
     - name: Environment
       run: |
         python utils/print_env.py
@@ -186,7 +192,7 @@ jobs:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
         RUN_COMPILE: yes
       run: |
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
     - name: Failure short reports
       if: ${{ failure() }}
       run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -227,7 +233,7 @@ jobs:
       env:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
       run: |
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
     - name: Failure short reports
       if: ${{ failure() }}
       run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -270,7 +276,7 @@ jobs:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
       run: |
         uv pip install ".[training]"
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
 
     - name: Failure short reports
       if: ${{ failure() }}
diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml
index ae619d481c48..38cbffaa6315 100644
--- a/.github/workflows/push_tests_fast.yml
+++ b/.github/workflows/push_tests_fast.yml
@@ -70,7 +70,7 @@ jobs:
       if: ${{ matrix.config.framework == 'pytorch' }}
       run: |
         pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -k "not Flax and not Onnx" \
           --make-reports=tests_${{ matrix.config.report }} \
           tests/
 
diff --git a/.github/workflows/push_tests_mps.yml b/.github/workflows/push_tests_mps.yml
index 484c7a8eeb49..2d6feb592815 100644
--- a/.github/workflows/push_tests_mps.yml
+++ b/.github/workflows/push_tests_mps.yml
@@ -57,7 +57,7 @@ jobs:
         HF_HOME: /System/Volumes/Data/mnt/cache
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
-        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
+        ${CONDA_RUN} python -m pytest -n 0 --make-reports=tests_torch_mps tests/
 
     - name: Failure short reports
       if: ${{ failure() }}
diff --git a/.github/workflows/release_tests_fast.yml b/.github/workflows/release_tests_fast.yml
index 808818beada3..efdd6ea2b651 100644
--- a/.github/workflows/release_tests_fast.yml
+++ b/.github/workflows/release_tests_fast.yml
@@ -84,7 +84,7 @@ jobs:
           CUBLAS_WORKSPACE_CONFIG: :16:8
         run: |
           pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx" \
+            -k "not Flax and not Onnx" \
             --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
             tests/pipelines/${{ matrix.module }}
       - name: Failure short reports
@@ -137,7 +137,7 @@ jobs:
         CUBLAS_WORKSPACE_CONFIG: :16:8
       run: |
         pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -k "not Flax and not Onnx" \
           --make-reports=tests_torch_${{ matrix.module }}_cuda \
           tests/${{ matrix.module }}
 
@@ -187,7 +187,7 @@ jobs:
           CUBLAS_WORKSPACE_CONFIG: :16:8
         run: |
           pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -s -v -k "not Flax and not Onnx" \
+            -k "not Flax and not Onnx" \
             --make-reports=tests_torch_minimum_cuda \
             tests/models/test_modeling_common.py \
             tests/pipelines/test_pipelines_common.py \
@@ -240,7 +240,7 @@ jobs:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
         RUN_COMPILE: yes
       run: |
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
     - name: Failure short reports
       if: ${{ failure() }}
       run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -281,7 +281,7 @@ jobs:
       env:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
       run: |
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
     - name: Failure short reports
       if: ${{ failure() }}
       run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -326,7 +326,7 @@ jobs:
         HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
       run: |
         uv pip install ".[training]"
-        pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
+        pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
 
     - name: Failure short reports
       if: ${{ failure() }}
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 8103c01643c6..c176f8786f38 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -22,6 +22,8 @@
     title: Reproducibility
   - local: using-diffusers/schedulers
     title: Schedulers
+  - local: using-diffusers/automodel
+    title: AutoModel
   - local: using-diffusers/other-formats
     title: Model formats
   - local: using-diffusers/push_to_hub
@@ -119,6 +121,8 @@
     title: ComponentsManager
   - local: modular_diffusers/guiders
     title: Guiders
+  - local: modular_diffusers/custom_blocks
+    title: Building Custom Blocks
   title: Modular Diffusers
 - isExpanded: false
   sections:
@@ -329,6 +333,8 @@
         title: BriaTransformer2DModel
       - local: api/models/chroma_transformer
         title: ChromaTransformer2DModel
+      - local: api/models/chronoedit_transformer_3d
+        title: ChronoEditTransformer3DModel
       - local: api/models/cogvideox_transformer3d
         title: CogVideoXTransformer3DModel
       - local: api/models/cogview3plus_transformer2d
@@ -343,6 +349,8 @@
         title: DiTTransformer2DModel
       - local: api/models/easyanimate_transformer3d
         title: EasyAnimateTransformer3DModel
+      - local: api/models/flux2_transformer
+        title: Flux2Transformer2DModel
       - local: api/models/flux_transformer
         title: FluxTransformer2DModel
       - local: api/models/hidream_image_transformer
@@ -351,6 +359,8 @@
         title: HunyuanDiT2DModel
       - local: api/models/hunyuanimage_transformer_2d
         title: HunyuanImageTransformer2DModel
+      - local: api/models/hunyuan_video15_transformer_3d
+        title: HunyuanVideo15Transformer3DModel
       - local: api/models/hunyuan_video_transformer_3d
         title: HunyuanVideoTransformer3DModel
       - local: api/models/latte_transformer3d
@@ -365,6 +375,8 @@
         title: MochiTransformer3DModel
       - local: api/models/omnigen_transformer
         title: OmniGenTransformer2DModel
+      - local: api/models/ovisimage_transformer2d
+        title: OvisImageTransformer2DModel
       - local: api/models/pixart_transformer2d
         title: PixArtTransformer2DModel
       - local: api/models/prior_transformer
@@ -373,6 +385,8 @@
         title: QwenImageTransformer2DModel
       - local: api/models/sana_transformer2d
         title: SanaTransformer2DModel
+      - local: api/models/sana_video_transformer3d
+        title: SanaVideoTransformer3DModel
       - local: api/models/sd3_transformer2d
         title: SD3Transformer2DModel
       - local: api/models/skyreels_v2_transformer_3d
@@ -383,8 +397,12 @@
         title: Transformer2DModel
       - local: api/models/transformer_temporal
         title: TransformerTemporalModel
+      - local: api/models/wan_animate_transformer_3d
+        title: WanAnimateTransformer3DModel
       - local: api/models/wan_transformer_3d
         title: WanTransformer3DModel
+      - local: api/models/z_image_transformer2d
+        title: ZImageTransformer2DModel        
       title: Transformers
     - sections:
       - local: api/models/stable_cascade_unet
@@ -421,6 +439,8 @@
         title: AutoencoderKLHunyuanImageRefiner
       - local: api/models/autoencoder_kl_hunyuan_video
         title: AutoencoderKLHunyuanVideo
+      - local: api/models/autoencoder_kl_hunyuan_video15
+        title: AutoencoderKLHunyuanVideo15
       - local: api/models/autoencoderkl_ltx_video
         title: AutoencoderKLLTXVideo
       - local: api/models/autoencoderkl_magvit
@@ -444,6 +464,8 @@
   - sections:
     - local: api/pipelines/overview
       title: Overview
+    - local: api/pipelines/auto_pipeline
+      title: AutoPipeline
     - sections:
       - local: api/pipelines/audioldm
         title: AudioLDM
@@ -456,8 +478,6 @@
       - local: api/pipelines/stable_audio
         title: Stable Audio
       title: Audio
-    - local: api/pipelines/auto_pipeline
-      title: AutoPipeline
     - sections:
       - local: api/pipelines/amused
         title: aMUSEd
@@ -515,12 +535,16 @@
         title: EasyAnimate
       - local: api/pipelines/flux
         title: Flux
+      - local: api/pipelines/flux2
+        title: Flux2
       - local: api/pipelines/control_flux_inpaint
         title: FluxControlInpaint
       - local: api/pipelines/hidream
         title: HiDream-I1
       - local: api/pipelines/hunyuandit
         title: Hunyuan-DiT
+      - local: api/pipelines/hunyuanimage21
+        title: HunyuanImage2.1
       - local: api/pipelines/pix2pix
         title: InstructPix2Pix
       - local: api/pipelines/kandinsky
@@ -529,8 +553,8 @@
         title: Kandinsky 2.2
       - local: api/pipelines/kandinsky3
         title: Kandinsky 3
-      - local: api/pipelines/kandinsky5
-        title: Kandinsky 5
+      - local: api/pipelines/kandinsky5_image
+        title: Kandinsky 5.0 Image
       - local: api/pipelines/kolors
         title: Kolors
       - local: api/pipelines/latent_consistency_models
@@ -549,6 +573,8 @@
         title: MultiDiffusion
       - local: api/pipelines/omnigen
         title: OmniGen
+      - local: api/pipelines/ovis_image
+        title: Ovis-Image
       - local: api/pipelines/pag
         title: PAG
       - local: api/pipelines/paint_by_example
@@ -565,6 +591,8 @@
         title: Sana
       - local: api/pipelines/sana_sprint
         title: Sana Sprint
+      - local: api/pipelines/sana_video
+        title: Sana Video
       - local: api/pipelines/self_attention_guidance
         title: Self-Attention Guidance
       - local: api/pipelines/semantic_stable_diffusion
@@ -622,22 +650,28 @@
         title: VisualCloze
       - local: api/pipelines/wuerstchen
         title: Wuerstchen
+      - local: api/pipelines/z_image
+        title: Z-Image
       title: Image
     - sections:
       - local: api/pipelines/allegro
         title: Allegro
+      - local: api/pipelines/chronoedit
+        title: ChronoEdit
       - local: api/pipelines/cogvideox
         title: CogVideoX
       - local: api/pipelines/consisid
         title: ConsisID
       - local: api/pipelines/framepack
         title: Framepack
-      - local: api/pipelines/hunyuanimage21
-        title: HunyuanImage2.1
       - local: api/pipelines/hunyuan_video
         title: HunyuanVideo
+      - local: api/pipelines/hunyuan_video15
+        title: HunyuanVideo1.5
       - local: api/pipelines/i2vgenxl
         title: I2VGen-XL
+      - local: api/pipelines/kandinsky5_video
+        title: Kandinsky 5.0 Video
       - local: api/pipelines/latte
         title: Latte
       - local: api/pipelines/ltx_video
diff --git a/docs/source/en/api/cache.md b/docs/source/en/api/cache.md
index 9ba474208551..c93dcad43821 100644
--- a/docs/source/en/api/cache.md
+++ b/docs/source/en/api/cache.md
@@ -34,3 +34,9 @@ Cache methods speedup diffusion transformers by storing and reusing intermediate
 [[autodoc]] FirstBlockCacheConfig
 
 [[autodoc]] apply_first_block_cache
+
+### TaylorSeerCacheConfig
+
+[[autodoc]] TaylorSeerCacheConfig
+
+[[autodoc]] apply_taylorseer_cache
diff --git a/docs/source/en/api/loaders/lora.md b/docs/source/en/api/loaders/lora.md
index 8e0326e0c334..7911bc2b2332 100644
--- a/docs/source/en/api/loaders/lora.md
+++ b/docs/source/en/api/loaders/lora.md
@@ -30,7 +30,9 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`CogView4LoraLoaderMixin`] provides similar functions for [CogView4](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogview4).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
 - [`HiDreamImageLoraLoaderMixin`] provides similar functions for [HiDream Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hidream)
-- [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwen)
+- [`QwenImageLoraLoaderMixin`] provides similar functions for [Qwen Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/qwen).
+- [`ZImageLoraLoaderMixin`] provides similar functions for [Z-Image](https://huggingface.co/docs/diffusers/main/en/api/pipelines/zimage).
+- [`Flux2LoraLoaderMixin`] provides similar functions for [Flux2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux2).
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, unload, LoRAs and more.
 
 > [!TIP]
@@ -56,6 +58,10 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 
 [[autodoc]] loaders.lora_pipeline.FluxLoraLoaderMixin
 
+## Flux2LoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.Flux2LoraLoaderMixin
+
 ## CogVideoXLoraLoaderMixin
 
 [[autodoc]] loaders.lora_pipeline.CogVideoXLoraLoaderMixin
@@ -107,6 +113,10 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 
 [[autodoc]] loaders.lora_pipeline.QwenImageLoraLoaderMixin
 
+## ZImageLoraLoaderMixin
+
+[[autodoc]] loaders.lora_pipeline.ZImageLoraLoaderMixin
+
 ## KandinskyLoraLoaderMixin
 [[autodoc]] loaders.lora_pipeline.KandinskyLoraLoaderMixin
 
diff --git a/docs/source/en/api/models/auto_model.md b/docs/source/en/api/models/auto_model.md
index 376dd12d12c4..aee9b5dbe50c 100644
--- a/docs/source/en/api/models/auto_model.md
+++ b/docs/source/en/api/models/auto_model.md
@@ -12,15 +12,7 @@ specific language governing permissions and limitations under the License.
 
 # AutoModel
 
-The `AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. `AutoModel` automatically retrieves the correct model class from the checkpoint `config.json` file.
-
-```python
-from diffusers import AutoModel, AutoPipelineForText2Image
-
-unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
-pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
-```
-
+[`AutoModel`] automatically retrieves the correct model class from the checkpoint `config.json` file.
 
 ## AutoModel
 
diff --git a/docs/source/en/api/models/autoencoder_kl_hunyuan_video15.md b/docs/source/en/api/models/autoencoder_kl_hunyuan_video15.md
new file mode 100644
index 000000000000..e82fe31380a5
--- /dev/null
+++ b/docs/source/en/api/models/autoencoder_kl_hunyuan_video15.md
@@ -0,0 +1,36 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLHunyuanVideo15
+
+The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo1.5](https://github.com/Tencent/HunyuanVideo1-1.5) by Tencent.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLHunyuanVideo15
+
+vae = AutoencoderKLHunyuanVideo15.from_pretrained("hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v", subfolder="vae", torch_dtype=torch.float32)
+
+# make sure to enable tiling to avoid OOM
+vae.enable_tiling()
+```
+
+## AutoencoderKLHunyuanVideo15
+
+[[autodoc]] AutoencoderKLHunyuanVideo15
+  - decode
+  - encode
+  - all
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
diff --git a/docs/source/en/api/models/chronoedit_transformer_3d.md b/docs/source/en/api/models/chronoedit_transformer_3d.md
new file mode 100644
index 000000000000..94982821795d
--- /dev/null
+++ b/docs/source/en/api/models/chronoedit_transformer_3d.md
@@ -0,0 +1,32 @@
+<!-- Copyright 2025 The ChronoEdit Team and HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# ChronoEditTransformer3DModel
+
+A Diffusion Transformer model for 3D video-like data from [ChronoEdit: Towards Temporal Reasoning for Image Editing and World Simulation](https://huggingface.co/papers/2510.04290) from NVIDIA and University of Toronto, by Jay Zhangjie Wu, Xuanchi Ren, Tianchang Shen, Tianshi Cao, Kai He, Yifan Lu, Ruiyuan Gao, Enze Xie, Shiyi Lan, Jose M. Alvarez, Jun Gao, Sanja Fidler, Zian Wang, Huan Ling.
+
+> **TL;DR:** ChronoEdit reframes image editing as a video generation task, using input and edited images as start/end frames to leverage pretrained video models with temporal consistency. A temporal reasoning stage introduces reasoning tokens to ensure physically plausible edits and visualize the editing trajectory.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import ChronoEditTransformer3DModel
+
+transformer = ChronoEditTransformer3DModel.from_pretrained("nvidia/ChronoEdit-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## ChronoEditTransformer3DModel
+
+[[autodoc]] ChronoEditTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
diff --git a/docs/source/en/api/models/flux2_transformer.md b/docs/source/en/api/models/flux2_transformer.md
new file mode 100644
index 000000000000..c85681d2b011
--- /dev/null
+++ b/docs/source/en/api/models/flux2_transformer.md
@@ -0,0 +1,19 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Flux2Transformer2DModel
+
+A Transformer model for image-like data from [Flux2](https://hf.co/black-forest-labs/FLUX.2-dev).
+
+## Flux2Transformer2DModel
+
+[[autodoc]] Flux2Transformer2DModel
diff --git a/docs/source/en/api/models/hunyuan_video15_transformer_3d.md b/docs/source/en/api/models/hunyuan_video15_transformer_3d.md
new file mode 100644
index 000000000000..5ad4c6f4643f
--- /dev/null
+++ b/docs/source/en/api/models/hunyuan_video15_transformer_3d.md
@@ -0,0 +1,30 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# HunyuanVideo15Transformer3DModel
+
+A Diffusion Transformer model for 3D video-like data used in [HunyuanVideo1.5](https://github.com/Tencent/HunyuanVideo1-1.5).
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import HunyuanVideo15Transformer3DModel
+
+transformer = HunyuanVideo15Transformer3DModel.from_pretrained("hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v" subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## HunyuanVideo15Transformer3DModel
+
+[[autodoc]] HunyuanVideo15Transformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
diff --git a/docs/source/en/api/models/ovisimage_transformer2d.md b/docs/source/en/api/models/ovisimage_transformer2d.md
new file mode 100644
index 000000000000..484652404af3
--- /dev/null
+++ b/docs/source/en/api/models/ovisimage_transformer2d.md
@@ -0,0 +1,24 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# OvisImageTransformer2DModel
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import OvisImageTransformer2DModel
+
+transformer = OvisImageTransformer2DModel.from_pretrained("AIDC-AI/Ovis-Image-7B", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## OvisImageTransformer2DModel
+
+[[autodoc]] OvisImageTransformer2DModel
diff --git a/docs/source/en/api/models/sana_video_transformer3d.md b/docs/source/en/api/models/sana_video_transformer3d.md
new file mode 100644
index 000000000000..0cf1451a2d39
--- /dev/null
+++ b/docs/source/en/api/models/sana_video_transformer3d.md
@@ -0,0 +1,36 @@
+<!-- Copyright 2025 The SANA-Video Authors and HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# SanaVideoTransformer3DModel
+
+A Diffusion Transformer model for 3D data (video) from [SANA-Video: Efficient Video Generation with Block Linear Diffusion Transformer](https://huggingface.co/papers/2509.24695) from NVIDIA and MIT HAN Lab, by Junsong Chen, Yuyang Zhao, Jincheng Yu, Ruihang Chu, Junyu Chen, Shuai Yang, Xianbang Wang, Yicheng Pan, Daquan Zhou, Huan Ling, Haozhe Liu, Hongwei Yi, Hao Zhang, Muyang Li, Yukang Chen, Han Cai, Sanja Fidler, Ping Luo, Song Han, Enze Xie.
+
+The abstract from the paper is:
+
+*We introduce SANA-Video, a small diffusion model that can efficiently generate videos up to 720x1280 resolution and minute-length duration. SANA-Video synthesizes high-resolution, high-quality and long videos with strong text-video alignment at a remarkably fast speed, deployable on RTX 5090 GPU. Two core designs ensure our efficient, effective and long video generation: (1) Linear DiT: We leverage linear attention as the core operation, which is more efficient than vanilla attention given the large number of tokens processed in video generation. (2) Constant-Memory KV cache for Block Linear Attention: we design block-wise autoregressive approach for long video generation by employing a constant-memory state, derived from the cumulative properties of linear attention. This KV cache provides the Linear DiT with global context at a fixed memory cost, eliminating the need for a traditional KV cache and enabling efficient, minute-long video generation. In addition, we explore effective data filters and model training strategies, narrowing the training cost to 12 days on 64 H100 GPUs, which is only 1% of the cost of MovieGen. Given its low cost, SANA-Video achieves competitive performance compared to modern state-of-the-art small diffusion models (e.g., Wan 2.1-1.3B and SkyReel-V2-1.3B) while being 16x faster in measured latency. Moreover, SANA-Video can be deployed on RTX 5090 GPUs with NVFP4 precision, accelerating the inference speed of generating a 5-second 720p video from 71s to 29s (2.4x speedup). In summary, SANA-Video enables low-cost, high-quality video generation.*
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import SanaVideoTransformer3DModel
+import torch
+
+transformer = SanaVideoTransformer3DModel.from_pretrained("Efficient-Large-Model/SANA-Video_2B_480p_diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## SanaVideoTransformer3DModel
+
+[[autodoc]] SanaVideoTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
+
diff --git a/docs/source/en/api/models/wan_animate_transformer_3d.md b/docs/source/en/api/models/wan_animate_transformer_3d.md
new file mode 100644
index 000000000000..cc7b3f0c408c
--- /dev/null
+++ b/docs/source/en/api/models/wan_animate_transformer_3d.md
@@ -0,0 +1,30 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# WanAnimateTransformer3DModel
+
+A Diffusion Transformer model for 3D video-like data was introduced in [Wan Animate](https://github.com/Wan-Video/Wan2.2) by the Alibaba Wan Team.
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import WanAnimateTransformer3DModel
+
+transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## WanAnimateTransformer3DModel
+
+[[autodoc]] WanAnimateTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
diff --git a/docs/source/en/api/models/z_image_transformer2d.md b/docs/source/en/api/models/z_image_transformer2d.md
new file mode 100644
index 000000000000..2ecb9851febd
--- /dev/null
+++ b/docs/source/en/api/models/z_image_transformer2d.md
@@ -0,0 +1,19 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# ZImageTransformer2DModel
+
+A Transformer model for image-like data from [Z-Image](https://huggingface.co/Tongyi-MAI/Z-Image-Turbo).
+
+## ZImageTransformer2DModel
+
+[[autodoc]] ZImageTransformer2DModel
\ No newline at end of file
diff --git a/docs/source/en/api/pipelines/bria_fibo.md b/docs/source/en/api/pipelines/bria_fibo.md
index 96cad10dda15..96c6b0317e1b 100644
--- a/docs/source/en/api/pipelines/bria_fibo.md
+++ b/docs/source/en/api/pipelines/bria_fibo.md
@@ -21,9 +21,10 @@ With only 8 billion parameters, FIBO provides a new level of image quality, prom
 FIBO is trained exclusively on a structured prompt and will not work with freeform text prompts.
 you can use the [FIBO-VLM-prompt-to-JSON](https://huggingface.co/briaai/FIBO-VLM-prompt-to-JSON) model or the [FIBO-gemini-prompt-to-JSON](https://huggingface.co/briaai/FIBO-gemini-prompt-to-JSON)  to convert your freeform text prompt to a structured JSON prompt.
 
-its not recommended to use freeform text prompts directly with FIBO, as it will not produce the best results.
+> [!NOTE]
+> Avoid using freeform text prompts directly with FIBO because it does not produce the best results.
 
-you can learn more about FIBO in  [Bria Fibo Hugging Face page](https://huggingface.co/briaai/FIBO).
+Refer to the Bria Fibo Hugging Face [page](https://huggingface.co/briaai/FIBO) to learn more.
 
 
 ## Usage
@@ -37,9 +38,8 @@ hf auth login
 ```
 
 
-## BriaPipeline
+## BriaFiboPipeline
 
-[[autodoc]] BriaPipeline
+[[autodoc]] BriaFiboPipeline
 	- all
-	- __call__
-
+	- __call__
\ No newline at end of file
diff --git a/docs/source/en/api/pipelines/chronoedit.md b/docs/source/en/api/pipelines/chronoedit.md
new file mode 100644
index 000000000000..48e70ab9e55e
--- /dev/null
+++ b/docs/source/en/api/pipelines/chronoedit.md
@@ -0,0 +1,156 @@
+<!-- Copyright 2025 The ChronoEdit Team and HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <a href="https://huggingface.co/docs/diffusers/main/en/tutorials/using_peft_for_inference" target="_blank" rel="noopener">
+      <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+    </a>
+  </div>
+</div>
+
+# ChronoEdit
+
+[ChronoEdit: Towards Temporal Reasoning for Image Editing and World Simulation](https://huggingface.co/papers/2510.04290) from NVIDIA and University of Toronto, by Jay Zhangjie Wu, Xuanchi Ren, Tianchang Shen, Tianshi Cao, Kai He, Yifan Lu, Ruiyuan Gao, Enze Xie, Shiyi Lan, Jose M. Alvarez, Jun Gao, Sanja Fidler, Zian Wang, Huan Ling.
+
+> **TL;DR:** ChronoEdit reframes image editing as a video generation task, using input and edited images as start/end frames to leverage pretrained video models with temporal consistency. A temporal reasoning stage introduces reasoning tokens to ensure physically plausible edits and visualize the editing trajectory.
+
+*Recent advances in large generative models have greatly enhanced both image editing and in-context image generation, yet a critical gap remains in ensuring physical consistency, where edited objects must remain coherent. This capability is especially vital for world simulation related tasks. In this paper, we present ChronoEdit, a framework that reframes image editing as a video generation problem. First, ChronoEdit treats the input and edited images as the first and last frames of a video, allowing it to leverage large pretrained video generative models that capture not only object appearance but also the implicit physics of motion and interaction through learned temporal consistency. Second, ChronoEdit introduces a temporal reasoning stage that explicitly performs editing at inference time. Under this setting, target frame is jointly denoised with reasoning tokens to imagine a plausible editing trajectory that constrains the solution space to physically viable transformations. The reasoning tokens are then dropped after a few steps to avoid the high computational cost of rendering a full video. To validate ChronoEdit, we introduce PBench-Edit, a new benchmark of image-prompt pairs for contexts that require physical consistency, and demonstrate that ChronoEdit surpasses state-of-the-art baselines in both visual fidelity and physical plausibility. Project page for code and models: [this https URL](https://research.nvidia.com/labs/toronto-ai/chronoedit).*
+
+The ChronoEdit pipeline is developed by the ChronoEdit Team. The original code is available on [GitHub](https://github.com/nv-tlabs/ChronoEdit), and pretrained models can be found in the [nvidia/ChronoEdit](https://huggingface.co/collections/nvidia/chronoedit) collection on Hugging Face.
+
+
+### Image Editing
+
+```py
+import torch
+import numpy as np
+from diffusers import AutoencoderKLWan, ChronoEditTransformer3DModel, ChronoEditPipeline
+from diffusers.utils import export_to_video, load_image
+from transformers import CLIPVisionModel
+from PIL import Image
+
+model_id = "nvidia/ChronoEdit-14B-Diffusers"
+image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+transformer = ChronoEditTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
+pipe = ChronoEditPipeline.from_pretrained(model_id, image_encoder=image_encoder, transformer=transformer, vae=vae, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+image = load_image(
+    "https://huggingface.co/spaces/nvidia/ChronoEdit/resolve/main/examples/3.png"
+)
+max_area = 720 * 1280
+aspect_ratio = image.height / image.width
+mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+print("width", width, "height", height)
+image = image.resize((width, height))
+prompt = (
+    "The user wants to transform the image by adding a small, cute mouse sitting inside the floral teacup, enjoying a spa bath. The mouse should appear relaxed and cheerful, with a tiny white bath towel draped over its head like a turban. It should be positioned comfortably in the cup’s liquid, with gentle steam rising around it to blend with the cozy atmosphere. "
+    "The mouse’s pose should be natural—perhaps sitting upright with paws resting lightly on the rim or submerged in the tea. The teacup’s floral design, gold trim, and warm lighting must remain unchanged to preserve the original aesthetic. The steam should softly swirl around the mouse, enhancing the spa-like, whimsical mood."
+)
+
+output = pipe(
+    image=image,
+    prompt=prompt,
+    height=height,
+    width=width,
+    num_frames=5,
+    num_inference_steps=50,
+    guidance_scale=5.0,
+    enable_temporal_reasoning=False,
+    num_temporal_reasoning_steps=0,
+).frames[0]
+Image.fromarray((output[-1] * 255).clip(0, 255).astype("uint8")).save("output.png")
+```
+
+Optionally, enable **temporal reasoning** for improved physical consistency:
+```py
+output = pipe(
+    image=image,
+    prompt=prompt,
+    height=height,
+    width=width,
+    num_frames=29,
+    num_inference_steps=50,
+    guidance_scale=5.0,
+    enable_temporal_reasoning=True,
+    num_temporal_reasoning_steps=50,
+).frames[0]
+export_to_video(output, "output.mp4", fps=16)
+Image.fromarray((output[-1] * 255).clip(0, 255).astype("uint8")).save("output.png")
+```
+
+### Inference with 8-Step Distillation Lora
+
+```py
+import torch
+import numpy as np
+from diffusers import AutoencoderKLWan, ChronoEditTransformer3DModel, ChronoEditPipeline
+from diffusers.utils import export_to_video, load_image
+from transformers import CLIPVisionModel
+from PIL import Image
+
+model_id = "nvidia/ChronoEdit-14B-Diffusers"
+image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+transformer = ChronoEditTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
+pipe = ChronoEditPipeline.from_pretrained(model_id, image_encoder=image_encoder, transformer=transformer, vae=vae, torch_dtype=torch.bfloat16)
+lora_path = hf_hub_download(repo_id=model_id, filename="lora/chronoedit_distill_lora.safetensors")
+pipe.load_lora_weights(lora_path)
+pipe.fuse_lora(lora_scale=1.0)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
+pipe.to("cuda")
+
+image = load_image(
+    "https://huggingface.co/spaces/nvidia/ChronoEdit/resolve/main/examples/3.png"
+)
+max_area = 720 * 1280
+aspect_ratio = image.height / image.width
+mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+print("width", width, "height", height)
+image = image.resize((width, height))
+prompt = (
+    "The user wants to transform the image by adding a small, cute mouse sitting inside the floral teacup, enjoying a spa bath. The mouse should appear relaxed and cheerful, with a tiny white bath towel draped over its head like a turban. It should be positioned comfortably in the cup’s liquid, with gentle steam rising around it to blend with the cozy atmosphere. "
+    "The mouse’s pose should be natural—perhaps sitting upright with paws resting lightly on the rim or submerged in the tea. The teacup’s floral design, gold trim, and warm lighting must remain unchanged to preserve the original aesthetic. The steam should softly swirl around the mouse, enhancing the spa-like, whimsical mood."
+)
+
+output = pipe(
+    image=image,
+    prompt=prompt,
+    height=height,
+    width=width,
+    num_frames=5,
+    num_inference_steps=8,
+    guidance_scale=1.0,
+    enable_temporal_reasoning=False,
+    num_temporal_reasoning_steps=0,
+).frames[0]
+export_to_video(output, "output.mp4", fps=16)
+Image.fromarray((output[-1] * 255).clip(0, 255).astype("uint8")).save("output.png")
+```
+
+## ChronoEditPipeline
+
+[[autodoc]] ChronoEditPipeline
+  - all
+  - __call__
+
+## ChronoEditPipelineOutput
+
+[[autodoc]] pipelines.chronoedit.pipeline_output.ChronoEditPipelineOutput
\ No newline at end of file
diff --git a/docs/source/en/api/pipelines/flux2.md b/docs/source/en/api/pipelines/flux2.md
new file mode 100644
index 000000000000..393e0d03c341
--- /dev/null
+++ b/docs/source/en/api/pipelines/flux2.md
@@ -0,0 +1,39 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Flux2
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
+</div>
+
+Flux.2 is the recent series of image generation models from Black Forest Labs, preceded by the [Flux.1](./flux.md) series. It is an entirely new model with a new architecture and pre-training done from scratch!
+
+Original model checkpoints for Flux can be found [here](https://huggingface.co/black-forest-labs). Original inference code can be found [here](https://github.com/black-forest-labs/flux2).
+
+> [!TIP]
+> Flux2 can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more.
+>
+> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
+
+## Caption upsampling
+
+Flux.2 can potentially generate better better outputs with better prompts. We can "upsample"
+an input prompt by setting the `caption_upsample_temperature` argument in the pipeline call arguments.
+The [official implementation](https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/text_encoder.py#L140) recommends this value to be 0.15.
+
+## Flux2Pipeline
+
+[[autodoc]] Flux2Pipeline
+	- all
+	- __call__
\ No newline at end of file
diff --git a/docs/source/en/api/pipelines/hunyuan_video15.md b/docs/source/en/api/pipelines/hunyuan_video15.md
new file mode 100644
index 000000000000..d77e72bb0f71
--- /dev/null
+++ b/docs/source/en/api/pipelines/hunyuan_video15.md
@@ -0,0 +1,120 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+
+# HunyuanVideo-1.5
+
+HunyuanVideo-1.5 is a lightweight yet powerful video generation model that achieves state-of-the-art visual quality and motion coherence with only 8.3 billion parameters, enabling efficient inference on consumer-grade GPUs. This achievement is built upon several key components, including meticulous data curation, an advanced DiT architecture with selective and sliding tile attention (SSTA), enhanced bilingual understanding through glyph-aware text encoding, progressive pre-training and post-training, and an efficient video super-resolution network. Leveraging these designs, we developed a unified framework capable of high-quality text-to-video and image-to-video generation across multiple durations and resolutions. Extensive experiments demonstrate that this compact and proficient model establishes a new state-of-the-art among open-source models.
+
+You can find all the original HunyuanVideo checkpoints under the [Tencent](https://huggingface.co/tencent) organization.
+
+> [!TIP]
+> Click on the HunyuanVideo models in the right sidebar for more examples of video generation tasks.
+>
+> The examples below use a checkpoint from [hunyuanvideo-community](https://huggingface.co/hunyuanvideo-community) because the weights are stored in a layout compatible with Diffusers.
+
+The example below demonstrates how to generate a video optimized for memory or inference speed.
+
+<hfoptions id="usage">
+<hfoption id="memory">
+
+Refer to the [Reduce memory usage](../../optimization/memory) guide for more details about the various memory saving techniques.
+
+
+```py
+import torch
+from diffusers import AutoModel, HunyuanVideo15Pipeline
+from diffusers.utils import export_to_video
+
+
+pipeline = HunyuanVideo15Pipeline.from_pretrained(
+    "HunyuanVideo-1.5-Diffusers-480p_t2v",
+    torch_dtype=torch.bfloat16,
+)
+
+# model-offloading and tiling
+pipeline.enable_model_cpu_offload()
+pipeline.vae.enable_tiling()
+
+prompt = "A fluffy teddy bear sits on a bed of soft pillows surrounded by children's toys."
+video = pipeline(prompt=prompt, num_frames=61, num_inference_steps=30).frames[0]
+export_to_video(video, "output.mp4", fps=15)
+```
+
+## Notes
+
+- HunyuanVideo1.5 use attention masks with variable-length sequences. For best performance, we recommend using an attention backend that handles padding efficiently.
+
+    - **H100/H800:** `_flash_3_hub` or `_flash_3_varlen_hub`
+    - **A100/A800/RTX 4090:** `flash_hub` or `flash_varlen_hub`
+    - **Other GPUs:** `sage_hub`
+
+Refer to the [Attention backends](../../optimization/attention_backends) guide for more details about using a different backend.
+
+
+```py
+pipe.transformer.set_attention_backend("flash_hub")  # or your preferred backend
+```
+
+- [`HunyuanVideo15Pipeline`] use guider and does not take `guidance_scale` parameter at runtime. 
+
+You can check the default guider configuration using `pipe.guider`:
+
+```py
+>>> pipe.guider 
+ClassifierFreeGuidance {
+  "_class_name": "ClassifierFreeGuidance",
+  "_diffusers_version": "0.36.0.dev0",
+  "enabled": true,
+  "guidance_rescale": 0.0,
+  "guidance_scale": 6.0,
+  "start": 0.0,
+  "stop": 1.0,
+  "use_original_formulation": false
+}
+
+State:
+  step: None
+  num_inference_steps: None
+  timestep: None
+  count_prepared: 0
+  enabled: True
+  num_conditions: 2
+```
+
+To update guider configuration, you can run `pipe.guider = pipe.guider.new(...)`
+
+```py
+pipe.guider = pipe.guider.new(guidance_scale=5.0)
+```
+
+Read more on Guider [here](../../modular_diffusers/guiders).
+
+
+
+## HunyuanVideo15Pipeline
+
+[[autodoc]] HunyuanVideo15Pipeline
+  - all
+  - __call__
+
+## HunyuanVideo15ImageToVideoPipeline
+
+[[autodoc]] HunyuanVideo15ImageToVideoPipeline
+  - all
+  - __call__
+
+## HunyuanVideo15PipelineOutput
+
+[[autodoc]] pipelines.hunyuan_video1_5.pipeline_output.HunyuanVideo15PipelineOutput
diff --git a/docs/source/en/api/pipelines/kandinsky5.md b/docs/source/en/api/pipelines/kandinsky5.md
deleted file mode 100644
index a98a0826b7db..000000000000
--- a/docs/source/en/api/pipelines/kandinsky5.md
+++ /dev/null
@@ -1,149 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
--->
-
-# Kandinsky 5.0
-
-Kandinsky 5.0 is created by the Kandinsky team: Alexey Letunovskiy, Maria Kovaleva, Ivan Kirillov, Lev Novitskiy, Denis Koposov, Dmitrii Mikhailov, Anna Averchenkova, Andrey Shutkin, Julia Agafonova, Olga Kim, Anastasiia Kargapoltseva, Nikita Kiselev, Anna Dmitrienko,  Anastasia Maltseva, Kirill Chernyshev, Ilia Vasiliev, Viacheslav Vasilev, Vladimir Polovnikov, Yury Kolabushin, Alexander Belykh, Mikhail Mamaev, Anastasia Aliaskina, Tatiana Nikulina, Polina Gavrilova, Vladimir Arkhipkin, Vladimir Korviakov, Nikolai Gerasimenko, Denis Parkhomenko, Denis Dimitrov
-
-
-Kandinsky 5.0 is a family of diffusion models for Video & Image generation. Kandinsky 5.0 T2V Lite is a lightweight video generation model (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger models and offers the best understanding of Russian concepts in the open-source ecosystem.
-
-The model introduces several key innovations:
-- **Latent diffusion pipeline** with **Flow Matching** for improved training stability
-- **Diffusion Transformer (DiT)** as the main generative backbone with cross-attention to text embeddings
-- Dual text encoding using **Qwen2.5-VL** and **CLIP** for comprehensive text understanding
-- **HunyuanVideo 3D VAE** for efficient video encoding and decoding
-- **Sparse attention mechanisms** (NABLA) for efficient long-sequence processing
-
-The original codebase can be found at [ai-forever/Kandinsky-5](https://github.com/ai-forever/Kandinsky-5).
-
-> [!TIP]
-> Check out the [AI Forever](https://huggingface.co/ai-forever) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.
-
-## Available Models
-
-Kandinsky 5.0 T2V Lite comes in several variants optimized for different use cases:
-
-| model_id | Description | Use Cases |
-|------------|-------------|-----------|
-| **ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers** | 5 second Supervised Fine-Tuned model | Highest generation quality |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers** | 10 second Supervised Fine-Tuned model | Highest generation quality |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers** | 5 second Classifier-Free Guidance distilled | 2× faster inference |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-10s-Diffusers** | 10 second Classifier-Free Guidance distilled | 2× faster inference |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers** | 5 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-10s-Diffusers** | 10 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers** | 5 second Base pretrained model | Research and fine-tuning |
-| **ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-10s-Diffusers** | 10 second Base pretrained model | Research and fine-tuning |
-
-All models are available in 5-second and 10-second video generation versions.
-
-## Kandinsky5T2VPipeline
-
-[[autodoc]] Kandinsky5T2VPipeline
-    - all
-    - __call__
-
-## Usage Examples
-
-### Basic Text-to-Video Generation
-
-```python
-import torch
-from diffusers import Kandinsky5T2VPipeline
-from diffusers.utils import export_to_video
-
-# Load the pipeline
-model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
-pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-pipe = pipe.to("cuda")
-
-# Generate video
-prompt = "A cat and a dog baking a cake together in a kitchen."
-negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
-
-output = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=512,
-    width=768,
-    num_frames=121,  # ~5 seconds at 24fps
-    num_inference_steps=50,
-    guidance_scale=5.0,
-).frames[0]
-
-export_to_video(output, "output.mp4", fps=24, quality=9)
-```
-
-### 10 second Models
-**⚠️ Warning!** all 10 second models should be used with Flex attention and max-autotune-no-cudagraphs compilation:
-
-```python
-pipe = Kandinsky5T2VPipeline.from_pretrained(
-    "ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers", 
-    torch_dtype=torch.bfloat16
-)
-pipe = pipe.to("cuda")
-
-pipe.transformer.set_attention_backend(
-    "flex"
-)                                       # <--- Set attention backend to Flex
-pipe.transformer.compile(
-    mode="max-autotune-no-cudagraphs", 
-    dynamic=True
-)                                       # <--- Compile with max-autotune-no-cudagraphs
-
-prompt = "A cat and a dog baking a cake together in a kitchen."
-negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
-
-output = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    height=512,
-    width=768,
-    num_frames=241,
-    num_inference_steps=50,
-    guidance_scale=5.0,
-).frames[0]
-
-export_to_video(output, "output.mp4", fps=24, quality=9)
-```
-
-### Diffusion Distilled model
-**⚠️ Warning!** all nocfg and diffusion distilled models should be inferred without CFG (```guidance_scale=1.0```):
-
-```python
-model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers"
-pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-pipe = pipe.to("cuda")
-
-output = pipe(
-    prompt="A beautiful sunset over mountains",
-    num_inference_steps=16,  # <--- Model is distilled in 16 steps
-    guidance_scale=1.0,      # <--- no CFG
-).frames[0]
-
-export_to_video(output, "output.mp4", fps=24, quality=9)
-```
-
-
-## Citation
-```bibtex
-@misc{kandinsky2025,
-    author = {Alexey Letunovskiy and Maria Kovaleva and Ivan Kirillov and Lev Novitskiy and Denis Koposov and
-              Dmitrii Mikhailov and Anna Averchenkova and Andrey Shutkin and Julia Agafonova and Olga Kim and
-              Anastasiia Kargapoltseva and Nikita Kiselev and Vladimir Arkhipkin and Vladimir Korviakov and
-              Nikolai Gerasimenko and Denis Parkhomenko and Anna Dmitrienko and Anastasia Maltseva and
-              Kirill Chernyshev and Ilia Vasiliev and Viacheslav Vasilev and Vladimir Polovnikov and
-              Yury Kolabushin and Alexander Belykh and Mikhail Mamaev and Anastasia Aliaskina and
-              Tatiana Nikulina and Polina Gavrilova and Denis Dimitrov},
-    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
-    howpublished = {\url{https://github.com/ai-forever/Kandinsky-5}},
-    year = 2025
-}
-```
diff --git a/docs/source/en/api/pipelines/kandinsky5_image.md b/docs/source/en/api/pipelines/kandinsky5_image.md
new file mode 100644
index 000000000000..1125e1594b03
--- /dev/null
+++ b/docs/source/en/api/pipelines/kandinsky5_image.md
@@ -0,0 +1,116 @@
+<!--Copyright 2025 The HuggingFace Team and Kandinsky Lab Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Kandinsky 5.0 Image
+
+[Kandinsky 5.0](https://arxiv.org/abs/2511.14993) is a family of diffusion models for Video & Image generation. 
+
+Kandinsky 5.0 Image Lite is a lightweight image generation model (6B parameters).
+
+The model introduces several key innovations:
+- **Latent diffusion pipeline** with **Flow Matching** for improved training stability
+- **Diffusion Transformer (DiT)** as the main generative backbone with cross-attention to text embeddings
+- Dual text encoding using **Qwen2.5-VL** and **CLIP** for comprehensive text understanding
+- **Flux VAE** for efficient image encoding and decoding
+
+The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).
+
+> [!TIP]
+> Check out the [Kandinsky Lab](https://huggingface.co/kandinskylab) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.
+
+
+## Available Models
+
+Kandinsky 5.0 Image Lite:
+
+| model_id | Description | Use Cases |
+|------------|-------------|-----------|
+| [**kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers) | 6B image Supervised Fine-Tuned model | Highest generation quality |
+| [**kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers) | 6B image editing Supervised Fine-Tuned model | Highest generation quality |
+| [**kandinskylab/Kandinsky-5.0-T2I-Lite-pretrain-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2I-Lite-pretrain-Diffusers) | 6B image Base pretrained model | Research and fine-tuning |
+| [**kandinskylab/Kandinsky-5.0-I2I-Lite-pretrain-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-I2I-Lite-pretrain-Diffusers) | 6B image editing Base pretrained model | Research and fine-tuning |
+
+## Usage Examples
+
+### Basic Text-to-Image Generation
+
+```python
+import torch
+from diffusers import Kandinsky5T2IPipeline
+
+# Load the pipeline
+model_id = "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers"
+pipe = Kandinsky5T2IPipeline.from_pretrained(model_id)
+_ = pipe.to(device='cuda',dtype=torch.bfloat16)
+
+# Generate image
+prompt = "A fluffy, expressive cat wearing a bright red hat with a soft, slightly textured fabric. The hat should look cozy and well-fitted on the cat’s head. On the front of the hat, add clean, bold white text that reads “SWEET”, clearly visible and neatly centered. Ensure the overall lighting highlights the hat’s color and the cat’s fur details."
+
+output = pipe(
+    prompt=prompt,
+    negative_prompt="",
+    height=1024,
+    width=1024,
+    num_inference_steps=50,
+    guidance_scale=3.5,
+).image[0]
+```
+
+### Basic Image-to-Image Generation
+
+```python
+import torch
+from diffusers import Kandinsky5I2IPipeline
+from diffusers.utils import load_image 
+# Load the pipeline
+model_id = "kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers"
+pipe = Kandinsky5I2IPipeline.from_pretrained(model_id)
+
+_ = pipe.to(device='cuda',dtype=torch.bfloat16)
+pipe.enable_model_cpu_offload()                                               # <--- Enable CPU offloading for single GPU inference
+
+# Edit the input image
+image = load_image(
+    "https://huggingface.co/kandinsky-community/kandinsky-3/resolve/main/assets/title.jpg?download=true"
+)
+
+prompt = "Change the background from a winter night scene to a bright summer day. Place the character on a sandy beach with clear blue sky, soft sunlight, and gentle waves in the distance. Replace the winter clothing with a light short-sleeved T-shirt (in soft pastel colors) and casual shorts. Ensure the character’s fur reflects warm daylight instead of cold winter tones. Add small beach details such as seashells, footprints in the sand, and a few scattered beach toys nearby. Keep the oranges in the scene, but place them naturally on the sand."
+negative_prompt = ""
+
+output = pipe(
+    image=image,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    guidance_scale=3.5,
+).image[0]
+```
+
+
+## Kandinsky5T2IPipeline
+
+[[autodoc]] Kandinsky5T2IPipeline
+    - all
+    - __call__
+
+## Kandinsky5I2IPipeline
+
+[[autodoc]] Kandinsky5I2IPipeline
+    - all
+    - __call__
+
+
+## Citation
+```bibtex
+@misc{kandinsky2025,
+    author = {Alexander Belykh and Alexander Varlamov and Alexey Letunovskiy and Anastasia Aliaskina and Anastasia Maltseva and Anastasiia Kargapoltseva and Andrey Shutkin and Anna Averchenkova and Anna Dmitrienko and Bulat Akhmatov and Denis Dimitrov and Denis Koposov and Denis Parkhomenko and Dmitrii and Ilya Vasiliev and Ivan Kirillov and Julia Agafonova and Kirill Chernyshev and Kormilitsyn Semen and Lev Novitskiy and Maria Kovaleva and Mikhail Mamaev and Mikhailov and Nikita Kiselev and Nikita Osterov and Nikolai Gerasimenko and Nikolai Vaulin and Olga Kim and Olga Vdovchenko and Polina Gavrilova and Polina Mikhailova and Tatiana Nikulina and Viacheslav Vasilev and Vladimir Arkhipkin and Vladimir Korviakov and Vladimir Polovnikov and Yury Kolabushin},
+    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
+    howpublished = {\url{https://github.com/kandinskylab/Kandinsky-5}},
+    year = 2025
+}
+```
diff --git a/docs/source/en/api/pipelines/kandinsky5_video.md b/docs/source/en/api/pipelines/kandinsky5_video.md
new file mode 100644
index 000000000000..733e2481732a
--- /dev/null
+++ b/docs/source/en/api/pipelines/kandinsky5_video.md
@@ -0,0 +1,310 @@
+<!--Copyright 2025 The HuggingFace Team Kandinsky Lab Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Kandinsky 5.0 Video
+
+[Kandinsky 5.0](https://arxiv.org/abs/2511.14993) is a family of diffusion models for Video & Image generation.
+
+Kandinsky 5.0 Lite line-up of lightweight video generation models (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger models and offers the best understanding of Russian concepts in the open-source ecosystem.
+
+Kandinsky 5.0 Pro line-up of large high quality video generation models (19B parameters). It offers high qualty generation in HD and more generation formats like I2V.
+
+The model introduces several key innovations:
+- **Latent diffusion pipeline** with **Flow Matching** for improved training stability
+- **Diffusion Transformer (DiT)** as the main generative backbone with cross-attention to text embeddings
+- Dual text encoding using **Qwen2.5-VL** and **CLIP** for comprehensive text understanding
+- **HunyuanVideo 3D VAE** for efficient video encoding and decoding
+- **Sparse attention mechanisms** (NABLA) for efficient long-sequence processing
+
+The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).
+
+> [!TIP]
+> Check out the [Kandinsky Lab](https://huggingface.co/kandinskylab) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.
+
+## Available Models
+
+Kandinsky 5.0 T2V Pro:
+
+| model_id | Description | Use Cases |
+|------------|-------------|-----------|
+| **kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers** | 5 second Text-to-Video Pro model | High-quality text-to-video generation |
+| **kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers** | 5 second Image-to-Video Pro model | High-quality image-to-video generation |
+
+Kandinsky 5.0 T2V Lite:
+| model_id | Description | Use Cases |
+|------------|-------------|-----------|
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers** | 5 second Supervised Fine-Tuned model | Highest generation quality |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers** | 10 second Supervised Fine-Tuned model | Highest generation quality |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers** | 5 second Classifier-Free Guidance distilled | 2× faster inference |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-10s-Diffusers** | 10 second Classifier-Free Guidance distilled | 2× faster inference |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers** | 5 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-10s-Diffusers** | 10 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers** | 5 second Base pretrained model | Research and fine-tuning |
+| **kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-10s-Diffusers** | 10 second Base pretrained model | Research and fine-tuning |
+
+
+## Usage Examples
+
+### Basic Text-to-Video Generation
+
+#### Pro
+**⚠️ Warning!** all Pro models should be infered with pipeline.enable_model_cpu_offload()  
+```python
+import torch
+from diffusers import Kandinsky5T2VPipeline
+from diffusers.utils import export_to_video
+
+# Load the pipeline
+model_id = "kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers"
+pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+
+pipe = pipe.to("cuda")
+pipeline.transformer.set_attention_backend("flex")                            # <--- Set attention bakend to Flex
+pipeline.enable_model_cpu_offload()                                           # <--- Enable cpu offloading for single GPU inference
+pipeline.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=True) # <--- Compile with max-autotune-no-cudagraphs
+
+# Generate video
+prompt = "A cat and a dog baking a cake together in a kitchen."
+negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
+
+output = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=768,
+    width=1024,
+    num_frames=121,  # ~5 seconds at 24fps
+    num_inference_steps=50,
+    guidance_scale=5.0,
+).frames[0]
+
+export_to_video(output, "output.mp4", fps=24, quality=9)
+```
+
+#### Lite
+```python
+import torch
+from diffusers import Kandinsky5T2VPipeline
+from diffusers.utils import export_to_video
+
+# Load the pipeline
+model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
+pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+pipe = pipe.to("cuda")
+
+# Generate video
+prompt = "A cat and a dog baking a cake together in a kitchen."
+negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
+
+output = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=512,
+    width=768,
+    num_frames=121,  # ~5 seconds at 24fps
+    num_inference_steps=50,
+    guidance_scale=5.0,
+).frames[0]
+
+export_to_video(output, "output.mp4", fps=24, quality=9)
+```
+
+### 10 second Models
+**⚠️ Warning!** all 10 second models should be used with Flex attention and max-autotune-no-cudagraphs compilation:
+
+```python
+pipe = Kandinsky5T2VPipeline.from_pretrained(
+    "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers", 
+    torch_dtype=torch.bfloat16
+)
+pipe = pipe.to("cuda")
+
+pipe.transformer.set_attention_backend(
+    "flex"
+)                                       # <--- Set attention bakend to Flex
+pipe.transformer.compile(
+    mode="max-autotune-no-cudagraphs", 
+    dynamic=True
+)                                       # <--- Compile with max-autotune-no-cudagraphs
+
+prompt = "A cat and a dog baking a cake together in a kitchen."
+negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
+
+output = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=512,
+    width=768,
+    num_frames=241,
+    num_inference_steps=50,
+    guidance_scale=5.0,
+).frames[0]
+
+export_to_video(output, "output.mp4", fps=24, quality=9)
+```
+
+### Diffusion Distilled model
+**⚠️ Warning!** all nocfg and diffusion distilled models should be infered wothout CFG (```guidance_scale=1.0```):
+
+```python
+model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers"
+pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+pipe = pipe.to("cuda")
+
+output = pipe(
+    prompt="A beautiful sunset over mountains",
+    num_inference_steps=16,  # <--- Model is distilled in 16 steps
+    guidance_scale=1.0,      # <--- no CFG
+).frames[0]
+
+export_to_video(output, "output.mp4", fps=24, quality=9)
+```
+
+
+### Basic Image-to-Video Generation
+**⚠️ Warning!** all Pro models should be infered with pipeline.enable_model_cpu_offload()  
+```python
+import torch
+from diffusers import Kandinsky5T2VPipeline
+from diffusers.utils import export_to_video
+
+# Load the pipeline
+model_id = "kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers"
+pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+
+pipe = pipe.to("cuda")
+pipeline.transformer.set_attention_backend("flex")                            # <--- Set attention bakend to Flex
+pipeline.enable_model_cpu_offload()                                           # <--- Enable cpu offloading for single GPU inference
+pipeline.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=True) # <--- Compile with max-autotune-no-cudagraphs
+
+# Generate video
+image = load_image(
+    "https://huggingface.co/kandinsky-community/kandinsky-3/resolve/main/assets/title.jpg?download=true"
+)
+height = 896
+width = 896
+image = image.resize((width, height))
+
+prompt = "An funny furry creture smiles happily and holds a sign that says 'Kandinsky'"
+negative_prompt = ""
+
+output = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=height,
+    width=width,
+    num_frames=121,  # ~5 seconds at 24fps
+    num_inference_steps=50,
+    guidance_scale=5.0,
+).frames[0]
+
+export_to_video(output, "output.mp4", fps=24, quality=9)
+```
+
+
+
+## Kandinsky 5.0 Pro Side-by-Side evaluation
+
+<table border="0" style="width: 200; text-align: left; margin-top: 20px;">
+  <tr>
+      <td>
+          <img width="200" alt="image" src="https://github.com/user-attachments/assets/73e5ff00-2735-40fd-8f01-767de9181918" />
+      </td>
+      <td>
+         <img width="200" alt="image" src="https://github.com/user-attachments/assets/f449a9e7-74b7-481d-82da-02723e396acd" />
+      </td>
+
+  <tr>
+      <td>
+          Comparison with Veo 3 
+      </td>
+      <td>
+          Comparison with Veo 3 fast
+      </td>
+  <tr>
+      <td>
+          <img width="200" alt="image" src="https://github.com/user-attachments/assets/a6902fb6-b5e8-4093-adad-aa4caab79c6d" />
+      </td>
+      <td>
+          <img width="200" alt="image" src="https://github.com/user-attachments/assets/09986015-3d07-4de8-b942-c145039b9b2d" />
+      </td>
+  <tr>
+      <td>
+          Comparison with Wan 2.2 A14B Text-to-Video mode
+      </td>
+      <td>
+          Comparison with Wan 2.2 A14B Image-to-Video mode
+      </td>
+
+</table>
+
+
+## Kandinsky 5.0 Lite Side-by-Side evaluation
+
+The evaluation is based on the expanded prompts from the [Movie Gen benchmark](https://github.com/facebookresearch/MovieGenBench), which are available in the expanded_prompt column of the benchmark/moviegen_bench.csv file.
+
+<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
+  <tr>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_sora.jpg" width=400 >
+      </td>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg" width=400 >
+      </td>
+  <tr>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg" width=400 >
+      </td>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg" width=400 >
+      </td>
+  <tr>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg" width=400 >
+      </td>
+
+</table>
+
+
+
+
+## Kandinsky 5.0 Lite Distill Side-by-Side evaluation
+
+<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
+  <tr>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg" width=400 >
+      </td>
+      <td>
+          <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg" width=400 >
+      </td>
+
+</table>
+
+## Kandinsky5T2VPipeline
+
+[[autodoc]] Kandinsky5T2VPipeline
+    - all
+    - __call__
+
+## Kandinsky5I2VPipeline
+
+[[autodoc]] Kandinsky5I2VPipeline
+    - all
+    - __call__
+
+
+## Citation
+```bibtex
+@misc{kandinsky2025,
+    author = {Alexander Belykh and Alexander Varlamov and Alexey Letunovskiy and Anastasia Aliaskina and Anastasia Maltseva and Anastasiia Kargapoltseva and Andrey Shutkin and Anna Averchenkova and Anna Dmitrienko and Bulat Akhmatov and Denis Dimitrov and Denis Koposov and Denis Parkhomenko and Dmitrii and Ilya Vasiliev and Ivan Kirillov and Julia Agafonova and Kirill Chernyshev and Kormilitsyn Semen and Lev Novitskiy and Maria Kovaleva and Mikhail Mamaev and Mikhailov and Nikita Kiselev and Nikita Osterov and Nikolai Gerasimenko and Nikolai Vaulin and Olga Kim and Olga Vdovchenko and Polina Gavrilova and Polina Mikhailova and Tatiana Nikulina and Viacheslav Vasilev and Vladimir Arkhipkin and Vladimir Korviakov and Vladimir Polovnikov and Yury Kolabushin},
+    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
+    howpublished = {\url{https://github.com/kandinskylab/Kandinsky-5}},
+    year = 2025
+}
+```
diff --git a/docs/source/en/api/pipelines/ovis_image.md b/docs/source/en/api/pipelines/ovis_image.md
new file mode 100644
index 000000000000..e03889b0a020
--- /dev/null
+++ b/docs/source/en/api/pipelines/ovis_image.md
@@ -0,0 +1,50 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Ovis-Image
+
+![concepts](https://github.com/AIDC-AI/Ovis-Image/blob/main/docs/imgs/ovis_image_case.png)
+
+Ovis-Image is a 7B text-to-image model specifically optimized for high-quality text rendering, designed to operate efficiently under stringent computational constraints.
+
+[Ovis-Image Technical Report](https://arxiv.org/abs/2511.22982) from Alibaba Group, by Guo-Hua Wang, Liangfu Cao, Tianyu Cui, Minghao Fu, Xiaohao Chen, Pengxin Zhan, Jianshan Zhao, Lan Li, Bowen Fu, Jiaqi Liu, Qing-Guo Chen.
+
+The abstract from the paper is:
+
+*We introduce Ovis-Image, a 7B text-to-image model specifically optimized for high-quality text rendering, designed to operate efficiently under stringent computational constraints. Built upon our previous Ovis-U1 framework, Ovis-Image integrates a diffusion-based visual decoder with the stronger Ovis 2.5 multimodal backbone, leveraging a text-centric training pipeline that combines large-scale pre-training with carefully tailored post-training refinements. Despite its compact architecture, Ovis-Image achieves text rendering performance on par with significantly larger open models such as Qwen-Image and approaches closed-source systems like Seedream and GPT4o. Crucially, the model remains deployable on a single high-end GPU with moderate memory, narrowing the gap between frontier-level text rendering and practical deployment. Our results indicate that combining a strong multimodal backbone with a carefully designed, text-focused training recipe is sufficient to achieve reliable bilingual text rendering without resorting to oversized or proprietary models.*
+
+**Highlights**: 
+
+*   **Strong text rendering at a compact 7B scale**: Ovis-Image is a 7B text-to-image model that delivers text rendering quality comparable to much larger 20B-class systems such as Qwen-Image and competitive with leading closed-source models like GPT4o in text-centric scenarios, while remaining small enough to run on widely accessible hardware.
+*   **High fidelity on text-heavy, layout-sensitive prompts**: The model excels on prompts that demand tight alignment between linguistic content and rendered typography (e.g., posters, banners, logos, UI mockups, infographics), producing legible, correctly spelled, and semantically consistent text across diverse fonts, sizes, and aspect ratios without compromising overall visual quality.
+*   **Efficiency and deployability**: With its 7B parameter budget and streamlined architecture, Ovis-Image fits on a single high-end GPU with moderate memory, supports low-latency interactive use, and scales to batch production serving, bringing near–frontier text rendering to applications where tens-of-billions–parameter models are impractical.
+
+
+This pipeline was contributed by Ovis-Image Team. The original codebase can be found [here](https://github.com/AIDC-AI/Ovis-Image).
+
+Available models:
+
+| Model | Recommended dtype |
+|:-----:|:-----------------:|
+| [`AIDC-AI/Ovis-Image-7B`](https://huggingface.co/AIDC-AI/Ovis-Image-7B) | `torch.bfloat16` |
+
+Refer to [this](https://huggingface.co/collections/AIDC-AI/ovis-image) collection for more information.
+
+## OvisImagePipeline
+
+[[autodoc]] OvisImagePipeline
+	- all
+	- __call__
+
+## OvisImagePipelineOutput
+
+[[autodoc]] pipelines.ovis_image.pipeline_output.OvisImagePipelineOutput
diff --git a/docs/source/en/api/pipelines/sana_sprint.md b/docs/source/en/api/pipelines/sana_sprint.md
index 357d7e406dd4..46cdc13302ec 100644
--- a/docs/source/en/api/pipelines/sana_sprint.md
+++ b/docs/source/en/api/pipelines/sana_sprint.md
@@ -24,9 +24,6 @@ The abstract from the paper is:
 
 *This paper presents SANA-Sprint, an efficient diffusion model for ultra-fast text-to-image (T2I) generation. SANA-Sprint is built on a pre-trained foundation model and augmented with hybrid distillation, dramatically reducing inference steps from 20 to 1-4. We introduce three key innovations: (1) We propose a training-free approach that transforms a pre-trained flow-matching model for continuous-time consistency distillation (sCM), eliminating costly training from scratch and achieving high training efficiency. Our hybrid distillation strategy combines sCM with latent adversarial distillation (LADD): sCM ensures alignment with the teacher model, while LADD enhances single-step generation fidelity. (2) SANA-Sprint is a unified step-adaptive model that achieves high-quality generation in 1-4 steps, eliminating step-specific training and improving efficiency. (3) We integrate ControlNet with SANA-Sprint for real-time interactive image generation, enabling instant visual feedback for user interaction. SANA-Sprint establishes a new Pareto frontier in speed-quality tradeoffs, achieving state-of-the-art performance with 7.59 FID and 0.74 GenEval in only 1 step — outperforming FLUX-schnell (7.94 FID / 0.71 GenEval) while being 10× faster (0.1s vs 1.1s on H100). It also achieves 0.1s (T2I) and 0.25s (ControlNet) latency for 1024×1024 images on H100, and 0.31s (T2I) on an RTX 4090, showcasing its exceptional efficiency and potential for AI-powered consumer applications (AIPC). Code and pre-trained models will be open-sourced.*
 
-> [!TIP]
-> Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
 This pipeline was contributed by [lawrence-cj](https://github.com/lawrence-cj), [shuchen Xue](https://github.com/scxue) and [Enze Xie](https://github.com/xieenze). The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model](https://huggingface.co/Efficient-Large-Model/).
 
 Available models:
diff --git a/docs/source/en/api/pipelines/sana_video.md b/docs/source/en/api/pipelines/sana_video.md
new file mode 100644
index 000000000000..9e330c758318
--- /dev/null
+++ b/docs/source/en/api/pipelines/sana_video.md
@@ -0,0 +1,189 @@
+<!-- Copyright 2025 The SANA-Video Authors and HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# Sana-Video
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+  <img alt="MPS" src="https://img.shields.io/badge/MPS-000000?style=flat&logo=apple&logoColor=white%22">
+</div>
+
+[SANA-Video: Efficient Video Generation with Block Linear Diffusion Transformer](https://huggingface.co/papers/2509.24695) from NVIDIA and MIT HAN Lab, by Junsong Chen, Yuyang Zhao, Jincheng Yu, Ruihang Chu, Junyu Chen, Shuai Yang, Xianbang Wang, Yicheng Pan, Daquan Zhou, Huan Ling, Haozhe Liu, Hongwei Yi, Hao Zhang, Muyang Li, Yukang Chen, Han Cai, Sanja Fidler, Ping Luo, Song Han, Enze Xie.
+
+The abstract from the paper is:
+
+*We introduce SANA-Video, a small diffusion model that can efficiently generate videos up to 720x1280 resolution and minute-length duration. SANA-Video synthesizes high-resolution, high-quality and long videos with strong text-video alignment at a remarkably fast speed, deployable on RTX 5090 GPU. Two core designs ensure our efficient, effective and long video generation: (1) Linear DiT: We leverage linear attention as the core operation, which is more efficient than vanilla attention given the large number of tokens processed in video generation. (2) Constant-Memory KV cache for Block Linear Attention: we design block-wise autoregressive approach for long video generation by employing a constant-memory state, derived from the cumulative properties of linear attention. This KV cache provides the Linear DiT with global context at a fixed memory cost, eliminating the need for a traditional KV cache and enabling efficient, minute-long video generation. In addition, we explore effective data filters and model training strategies, narrowing the training cost to 12 days on 64 H100 GPUs, which is only 1% of the cost of MovieGen. Given its low cost, SANA-Video achieves competitive performance compared to modern state-of-the-art small diffusion models (e.g., Wan 2.1-1.3B and SkyReel-V2-1.3B) while being 16x faster in measured latency. Moreover, SANA-Video can be deployed on RTX 5090 GPUs with NVFP4 precision, accelerating the inference speed of generating a 5-second 720p video from 71s to 29s (2.4x speedup). In summary, SANA-Video enables low-cost, high-quality video generation. [this https URL](https://github.com/NVlabs/SANA).*
+
+This pipeline was contributed by SANA Team. The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model](https://hf.co/collections/Efficient-Large-Model/sana-video).
+
+Available models:
+
+| Model | Recommended dtype |
+|:-----:|:-----------------:|
+| [`Efficient-Large-Model/SANA-Video_2B_480p_diffusers`](https://huggingface.co/Efficient-Large-Model/ANA-Video_2B_480p_diffusers) | `torch.bfloat16` |
+
+Refer to [this](https://huggingface.co/collections/Efficient-Large-Model/sana-video) collection for more information.
+
+Note: The recommended dtype mentioned is for the transformer weights. The text encoder and VAE weights must stay in `torch.bfloat16` or `torch.float32` for the model to work correctly. Please refer to the inference example below to see how to load the model with the recommended dtype. 
+
+
+## Generation Pipelines
+
+<hfoptions id="generation pipelines">`
+<hfoption id="Text-to-Video">
+
+The example below demonstrates how to use the text-to-video pipeline to generate a video using a text description.
+
+```python
+pipe = SanaVideoPipeline.from_pretrained(
+    "Efficient-Large-Model/SANA-Video_2B_480p_diffusers", 
+    torch_dtype=torch.bfloat16,
+)
+pipe.text_encoder.to(torch.bfloat16)
+pipe.vae.to(torch.float32)
+pipe.to("cuda")
+
+prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
+negative_prompt = "A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."
+motion_scale = 30
+motion_prompt = f" motion score: {motion_scale}."
+prompt = prompt + motion_prompt
+
+video = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=480,
+    width=832,
+    frames=81,
+    guidance_scale=6,
+    num_inference_steps=50,
+    generator=torch.Generator(device="cuda").manual_seed(0),
+).frames[0]
+
+export_to_video(video, "sana_video.mp4", fps=16)
+```
+
+</hfoption>
+<hfoption id="Image-to-Video">
+
+The example below demonstrates how to use the image-to-video pipeline to generate a video using a text description and a starting frame.
+
+```python
+pipe = SanaImageToVideoPipeline.from_pretrained(
+    "Efficient-Large-Model/SANA-Video_2B_480p_diffusers",
+    torch_dtype=torch.bfloat16,
+)
+pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
+pipe.vae.to(torch.float32)
+pipe.text_encoder.to(torch.bfloat16)
+pipe.to("cuda")
+
+image = load_image("https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/samples/i2v-1.png")
+prompt = "A woman stands against a stunning sunset backdrop, her long, wavy brown hair gently blowing in the breeze. She wears a sleeveless, light-colored blouse with a deep V-neckline, which accentuates her graceful posture. The warm hues of the setting sun cast a golden glow across her face and hair, creating a serene and ethereal atmosphere. The background features a blurred landscape with soft, rolling hills and scattered clouds, adding depth to the scene. The camera remains steady, capturing the tranquil moment from a medium close-up angle."
+negative_prompt = "A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."
+motion_scale = 30
+motion_prompt = f" motion score: {motion_scale}."
+prompt = prompt + motion_prompt
+
+motion_scale = 30.0
+
+video = pipe(
+    image=image,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=480,
+    width=832,
+    frames=81,
+    guidance_scale=6,
+    num_inference_steps=50,
+    generator=torch.Generator(device="cuda").manual_seed(0),
+).frames[0]
+
+export_to_video(video, "sana-i2v.mp4", fps=16)
+```
+
+</hfoption>
+</hfoptions>
+
+
+## Quantization
+
+Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
+
+Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`SanaVideoPipeline`] for inference with bitsandbytes.
+
+```py
+import torch
+from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, SanaVideoTransformer3DModel, SanaVideoPipeline
+from transformers import BitsAndBytesConfig as BitsAndBytesConfig, AutoModel
+
+quant_config = BitsAndBytesConfig(load_in_8bit=True)
+text_encoder_8bit = AutoModel.from_pretrained(
+    "Efficient-Large-Model/SANA-Video_2B_480p_diffusers",
+    subfolder="text_encoder",
+    quantization_config=quant_config,
+    torch_dtype=torch.float16,
+)
+
+quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
+transformer_8bit = SanaVideoTransformer3DModel.from_pretrained(
+    "Efficient-Large-Model/SANA-Video_2B_480p_diffusers",
+    subfolder="transformer",
+    quantization_config=quant_config,
+    torch_dtype=torch.float16,
+)
+
+pipeline = SanaVideoPipeline.from_pretrained(
+    "Efficient-Large-Model/SANA-Video_2B_480p_diffusers",
+    text_encoder=text_encoder_8bit,
+    transformer=transformer_8bit,
+    torch_dtype=torch.float16,
+    device_map="balanced",
+)
+
+model_score = 30
+prompt = "Evening, backlight, side lighting, soft light, high contrast, mid-shot, centered composition, clean solo shot, warm color. A young Caucasian man stands in a forest, golden light glimmers on his hair as sunlight filters through the leaves. He wears a light shirt, wind gently blowing his hair and collar, light dances across his face with his movements. The background is blurred, with dappled light and soft tree shadows in the distance. The camera focuses on his lifted gaze, clear and emotional."
+negative_prompt = "A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."
+motion_prompt = f" motion score: {model_score}."
+prompt = prompt + motion_prompt
+
+output = pipeline(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=480,
+    width=832,
+    num_frames=81,
+    guidance_scale=6.0,
+    num_inference_steps=50
+).frames[0]
+export_to_video(output, "sana-video-output.mp4", fps=16)
+```
+
+## SanaVideoPipeline
+
+[[autodoc]] SanaVideoPipeline
+  - all
+  - __call__
+
+
+## SanaImageToVideoPipeline
+
+[[autodoc]] SanaImageToVideoPipeline
+  - all
+  - __call__
+
+
+## SanaVideoPipelineOutput
+
+[[autodoc]] pipelines.sana_video.pipeline_sana_video.SanaVideoPipelineOutput
diff --git a/docs/source/en/api/pipelines/wan.md b/docs/source/en/api/pipelines/wan.md
index 3289a840e2b1..6aab6c5b33b9 100644
--- a/docs/source/en/api/pipelines/wan.md
+++ b/docs/source/en/api/pipelines/wan.md
@@ -40,6 +40,7 @@ The following Wan models are supported in Diffusers:
 - [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
 - [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
 - [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
+- [Wan 2.2 Animate 14B](https://huggingface.co/Wan-AI/Wan2.2-Animate-14B-Diffusers)
 
 > [!TIP]
 > Click on the Wan models in the right sidebar for more examples of video generation.
@@ -95,15 +96,15 @@ pipeline = WanPipeline.from_pretrained(
 pipeline.to("cuda")
 
 prompt = """
-The camera rushes from far to near in a low-angle shot, 
-revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in 
-for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground. 
-Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic 
+The camera rushes from far to near in a low-angle shot,
+revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
+for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
+Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
 shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
 """
 negative_prompt = """
-Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, 
-low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, 
+Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
+low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
 misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
 """
 
@@ -150,15 +151,15 @@ pipeline.transformer = torch.compile(
 )
 
 prompt = """
-The camera rushes from far to near in a low-angle shot, 
-revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in 
-for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground. 
-Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic 
+The camera rushes from far to near in a low-angle shot,
+revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
+for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
+Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
 shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
 """
 negative_prompt = """
-Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, 
-low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, 
+Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
+low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
 misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
 """
 
@@ -249,6 +250,208 @@ The code snippets available in [this](https://github.com/huggingface/diffusers/p
 
 The general rule of thumb to keep in mind when preparing inputs for the VACE pipeline is that the input images, or frames of a video that you want to use for conditioning, should have a corresponding mask that is black in color. The black mask signifies that the model will not generate new content for that area, and only use those parts for conditioning the generation process. For parts/frames that should be generated by the model, the mask should be white in color.
 
+</hfoption>
+</hfoptions>
+
+### Wan-Animate: Unified Character Animation and Replacement with Holistic Replication
+
+[Wan-Animate](https://huggingface.co/papers/2509.14055) by the Wan Team.
+
+*We introduce Wan-Animate, a unified framework for character animation and replacement. Given a character image and a reference video, Wan-Animate can animate the character by precisely replicating the expressions and movements of the character in the video to generate high-fidelity character videos. Alternatively, it can integrate the animated character into the reference video to replace the original character, replicating the scene's lighting and color tone to achieve seamless environmental integration. Wan-Animate is built upon the Wan model. To adapt it for character animation tasks, we employ a modified input paradigm to differentiate between reference conditions and regions for generation. This design unifies multiple tasks into a common symbolic representation. We use spatially-aligned skeleton signals to replicate body motion and implicit facial features extracted from source images to reenact expressions, enabling the generation of character videos with high controllability and expressiveness. Furthermore, to enhance environmental integration during character replacement, we develop an auxiliary Relighting LoRA. This module preserves the character's appearance consistency while applying the appropriate environmental lighting and color tone. Experimental results demonstrate that Wan-Animate achieves state-of-the-art performance. We are committed to open-sourcing the model weights and its source code.*
+
+The project page: https://humanaigc.github.io/wan-animate
+
+This model was mostly contributed by [M. Tolga Cangöz](https://github.com/tolgacangoz).
+
+#### Usage
+
+The Wan-Animate pipeline supports two modes of operation:
+
+1. **Animation Mode** (default): Animates a character image based on motion and expression from reference videos
+2. **Replacement Mode**: Replaces a character in a background video with a new character while preserving the scene
+
+##### Prerequisites
+
+Before using the pipeline, you need to preprocess your reference video to extract:
+- **Pose video**: Contains skeletal keypoints representing body motion
+- **Face video**: Contains facial feature representations for expression control
+
+For replacement mode, you additionally need:
+- **Background video**: The original video containing the scene
+- **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)
+
+> [!NOTE]
+> Raw videos should not be used for inputs such as `pose_video`, which the pipeline expects to be preprocessed to extract the proper information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
+
+The example below demonstrates how to use the Wan-Animate pipeline:
+
+<hfoptions id="Animate usage">
+<hfoption id="Animation mode">
+
+```python
+import numpy as np
+import torch
+from diffusers import AutoencoderKLWan, WanAnimatePipeline
+from diffusers.utils import export_to_video, load_image, load_video
+
+model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+# Load character image and preprocessed videos
+image = load_image("path/to/character.jpg")
+pose_video = load_video("path/to/pose_video.mp4")  # Preprocessed skeletal keypoints
+face_video = load_video("path/to/face_video.mp4")  # Preprocessed facial features
+
+# Resize image to match VAE constraints
+def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
+    aspect_ratio = image.height / image.width
+    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+    image = image.resize((width, height))
+    return image, height, width
+
+image, height, width = aspect_ratio_resize(image, pipe)
+
+prompt = "A person dancing energetically in a studio with dynamic lighting and professional camera work"
+negative_prompt = "blurry, low quality, distorted, deformed, static, poorly drawn"
+
+# Generate animated video
+output = pipe(
+    image=image,
+    pose_video=pose_video,
+    face_video=face_video,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=height,
+    width=width,
+    segment_frame_length=77,
+    guidance_scale=1.0,
+    mode="animate",  # Animation mode (default)
+).frames[0]
+export_to_video(output, "animated_character.mp4", fps=30)
+```
+
+</hfoption>
+<hfoption id="Replacement mode">
+
+```python
+import numpy as np
+import torch
+from diffusers import AutoencoderKLWan, WanAnimatePipeline
+from diffusers.utils import export_to_video, load_image, load_video
+
+model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+# Load all required inputs for replacement mode
+image = load_image("path/to/new_character.jpg")
+pose_video = load_video("path/to/pose_video.mp4")  # Preprocessed skeletal keypoints
+face_video = load_video("path/to/face_video.mp4")  # Preprocessed facial features
+background_video = load_video("path/to/background_video.mp4")  # Original scene
+mask_video = load_video("path/to/mask_video.mp4")  # Black: preserve, White: generate
+
+# Resize image to match video dimensions
+def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
+    aspect_ratio = image.height / image.width
+    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+    image = image.resize((width, height))
+    return image, height, width
+
+image, height, width = aspect_ratio_resize(image, pipe)
+
+prompt = "A person seamlessly integrated into the scene with consistent lighting and environment"
+negative_prompt = "blurry, low quality, inconsistent lighting, floating, disconnected from scene"
+
+# Replace character in background video
+output = pipe(
+    image=image,
+    pose_video=pose_video,
+    face_video=face_video,
+    background_video=background_video,
+    mask_video=mask_video,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=height,
+    width=width,
+    segment_frame_lengths=77,
+    guidance_scale=1.0,
+    mode="replace",  # Replacement mode
+).frames[0]
+export_to_video(output, "character_replaced.mp4", fps=30)
+```
+
+</hfoption>
+<hfoption id="Advanced options">
+
+```python
+import numpy as np
+import torch
+from diffusers import AutoencoderKLWan, WanAnimatePipeline
+from diffusers.utils import export_to_video, load_image, load_video
+
+model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
+vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+image = load_image("path/to/character.jpg")
+pose_video = load_video("path/to/pose_video.mp4")
+face_video = load_video("path/to/face_video.mp4")
+
+def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
+    aspect_ratio = image.height / image.width
+    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+    image = image.resize((width, height))
+    return image, height, width
+
+image, height, width = aspect_ratio_resize(image, pipe)
+
+prompt = "A person dancing energetically in a studio"
+negative_prompt = "blurry, low quality"
+
+# Advanced: Use temporal guidance and custom callback
+def callback_fn(pipe, step_index, timestep, callback_kwargs):
+    # You can modify latents or other tensors here
+    print(f"Step {step_index}, Timestep {timestep}")
+    return callback_kwargs
+
+output = pipe(
+    image=image,
+    pose_video=pose_video,
+    face_video=face_video,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    height=height,
+    width=width,
+    segment_frame_length=77,
+    num_inference_steps=50,
+    guidance_scale=5.0,
+    prev_segment_conditioning_frames=5,  # Use 5 frames for temporal guidance (1 or 5 recommended)
+    callback_on_step_end=callback_fn,
+    callback_on_step_end_tensor_inputs=["latents"],
+).frames[0]
+export_to_video(output, "animated_advanced.mp4", fps=30)
+```
+
+</hfoption>
+</hfoptions>
+
+#### Key Parameters
+
+- **mode**: Choose between `"animate"` (default) or `"replace"`
+- **prev_segment_conditioning_frames**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
+- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)
+
+
 ## Notes
 
 - Wan2.1 supports LoRAs with [`~loaders.WanLoraLoaderMixin.load_lora_weights`].
@@ -281,10 +484,10 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
 
   # use "steamboat willie style" to trigger the LoRA
   prompt = """
-  steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot, 
-  revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in 
-  for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground. 
-  Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic 
+  steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
+  revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
+  for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
+  Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
   shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
   """
 
@@ -359,6 +562,12 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
   - all
   - __call__
 
+## WanAnimatePipeline
+
+[[autodoc]] WanAnimatePipeline
+  - all
+  - __call__
+
 ## WanPipelineOutput
 
-[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
\ No newline at end of file
+[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
diff --git a/docs/source/en/api/pipelines/z_image.md b/docs/source/en/api/pipelines/z_image.md
new file mode 100644
index 000000000000..5175f6b0fb6f
--- /dev/null
+++ b/docs/source/en/api/pipelines/z_image.md
@@ -0,0 +1,66 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Z-Image
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+
+[Z-Image](https://huggingface.co/papers/2511.22699) is a powerful and highly efficient image generation model with 6B parameters. Currently there's only one model with two more to be released:
+
+|Model|Hugging Face|
+|---|---|
+|Z-Image-Turbo|https://huggingface.co/Tongyi-MAI/Z-Image-Turbo|
+
+## Z-Image-Turbo
+
+Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
+
+## Image-to-image
+
+Use [`ZImageImg2ImgPipeline`] to transform an existing image based on a text prompt.
+
+```python
+import torch
+from diffusers import ZImageImg2ImgPipeline
+from diffusers.utils import load_image
+
+pipe = ZImageImg2ImgPipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+init_image = load_image(url).resize((1024, 1024))
+
+prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors"
+image = pipe(
+    prompt,
+    image=init_image,
+    strength=0.6,
+    num_inference_steps=9,
+    guidance_scale=0.0,
+    generator=torch.Generator("cuda").manual_seed(42),
+).images[0]
+image.save("zimage_img2img.png")
+```
+
+## ZImagePipeline
+
+[[autodoc]] ZImagePipeline
+	- all
+	- __call__
+
+## ZImageImg2ImgPipeline
+
+[[autodoc]] ZImageImg2ImgPipeline
+	- all
+	- __call__
diff --git a/docs/source/en/modular_diffusers/custom_blocks.md b/docs/source/en/modular_diffusers/custom_blocks.md
new file mode 100644
index 000000000000..1c311582264e
--- /dev/null
+++ b/docs/source/en/modular_diffusers/custom_blocks.md
@@ -0,0 +1,492 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+
+# Building Custom Blocks
+
+[ModularPipelineBlocks](./pipeline_block) are the fundamental building blocks of a [`ModularPipeline`]. You can create custom blocks by defining their inputs, outputs, and computation logic. This guide demonstrates how to create and use a custom block.
+
+> [!TIP]
+> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom modular blocks like Nano Banana.
+
+## Project Structure
+
+Your custom block project should use the following structure:
+
+```shell
+.
+├── block.py
+└── modular_config.json
+```
+
+- `block.py` contains the custom block implementation
+- `modular_config.json` contains the metadata needed to load the block
+
+## Example: Florence 2 Inpainting Block
+
+In this example we will create a custom block that uses the [Florence 2](https://huggingface.co/docs/transformers/model_doc/florence2) model to process an input image and generate a mask for inpainting.
+
+The first step is to define the components that the block will use. In this case, we will need to use the `Florence2ForConditionalGeneration` model and its corresponding processor `AutoProcessor`. When defining components, we must specify the name of the component within our pipeline, model class via `type_hint`, and provide a `pretrained_model_name_or_path` for the component if we intend to load the model weights from a specific repository on the Hub.
+
+```py
+# Inside block.py
+from diffusers.modular_pipelines import (
+    ModularPipelineBlocks,
+    ComponentSpec,
+)
+from transformers import AutoProcessor, Florence2ForConditionalGeneration
+
+
+class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
+
+    @property
+    def expected_components(self):
+        return [
+            ComponentSpec(
+                name="image_annotator",
+                type_hint=Florence2ForConditionalGeneration,
+                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
+            ),
+            ComponentSpec(
+                name="image_annotator_processor",
+                type_hint=AutoProcessor,
+                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
+            ),
+        ]
+```
+
+Next, we define the inputs and outputs of the block. The inputs include the image to be annotated, the annotation task, and the annotation prompt. The outputs include the generated mask image and annotations.
+
+```py
+from typing import List, Union
+from PIL import Image, ImageDraw
+import torch
+import numpy as np
+
+from diffusers.modular_pipelines import (
+    PipelineState,
+    ModularPipelineBlocks,
+    InputParam,
+    ComponentSpec,
+    OutputParam,
+)
+from transformers import AutoProcessor, Florence2ForConditionalGeneration
+
+
+class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
+
+    @property
+    def expected_components(self):
+        return [
+            ComponentSpec(
+                name="image_annotator",
+                type_hint=Florence2ForConditionalGeneration,
+                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
+            ),
+            ComponentSpec(
+                name="image_annotator_processor",
+                type_hint=AutoProcessor,
+                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                "image",
+                type_hint=Union[Image.Image, List[Image.Image]],
+                required=True,
+                description="Image(s) to annotate",
+            ),
+            InputParam(
+                "annotation_task",
+                type_hint=Union[str, List[str]],
+                required=True,
+                default="<REFERRING_EXPRESSION_SEGMENTATION>",
+                description="""Annotation Task to perform on the image.
+                Supported Tasks:
+
+                <OD>
+                <REFERRING_EXPRESSION_SEGMENTATION>
+                <CAPTION>
+                <DETAILED_CAPTION>
+                <MORE_DETAILED_CAPTION>
+                <DENSE_REGION_CAPTION>
+                <CAPTION_TO_PHRASE_GROUNDING>
+                <OPEN_VOCABULARY_DETECTION>
+
+                """,
+            ),
+            InputParam(
+                "annotation_prompt",
+                type_hint=Union[str, List[str]],
+                required=True,
+                description="""Annotation Prompt to provide more context to the task.
+                Can be used to detect or segment out specific elements in the image
+                """,
+            ),
+            InputParam(
+                "annotation_output_type",
+                type_hint=str,
+                required=True,
+                default="mask_image",
+                description="""Output type from annotation predictions. Availabe options are
+                mask_image:
+                    -black and white mask image for the given image based on the task type
+                mask_overlay:
+                    - mask overlayed on the original image
+                bounding_box:
+                    - bounding boxes drawn on the original image
+                """,
+            ),
+            InputParam(
+                "annotation_overlay",
+                type_hint=bool,
+                required=True,
+                default=False,
+                description="",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "mask_image",
+                type_hint=Image,
+                description="Inpainting Mask for input Image(s)",
+            ),
+            OutputParam(
+                "annotations",
+                type_hint=dict,
+                description="Annotations Predictions for input Image(s)",
+            ),
+            OutputParam(
+                "image",
+                type_hint=Image,
+                description="Annotated input Image(s)",
+            ),
+        ]
+
+```
+
+Now we implement the `__call__` method, which contains the logic for processing the input image and generating the mask.
+
+```py
+from typing import List, Union
+from PIL import Image, ImageDraw
+import torch
+import numpy as np
+
+from diffusers.modular_pipelines import (
+    PipelineState,
+    ModularPipelineBlocks,
+    InputParam,
+    ComponentSpec,
+    OutputParam,
+)
+from transformers import AutoProcessor, Florence2ForConditionalGeneration
+
+
+class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
+
+    @property
+    def expected_components(self):
+        return [
+            ComponentSpec(
+                name="image_annotator",
+                type_hint=Florence2ForConditionalGeneration,
+                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
+            ),
+            ComponentSpec(
+                name="image_annotator_processor",
+                type_hint=AutoProcessor,
+                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                "image",
+                type_hint=Union[Image.Image, List[Image.Image]],
+                required=True,
+                description="Image(s) to annotate",
+            ),
+            InputParam(
+                "annotation_task",
+                type_hint=Union[str, List[str]],
+                required=True,
+                default="<REFERRING_EXPRESSION_SEGMENTATION>",
+                description="""Annotation Task to perform on the image.
+                Supported Tasks:
+
+                <OD>
+                <REFERRING_EXPRESSION_SEGMENTATION>
+                <CAPTION>
+                <DETAILED_CAPTION>
+                <MORE_DETAILED_CAPTION>
+                <DENSE_REGION_CAPTION>
+                <CAPTION_TO_PHRASE_GROUNDING>
+                <OPEN_VOCABULARY_DETECTION>
+
+                """,
+            ),
+            InputParam(
+                "annotation_prompt",
+                type_hint=Union[str, List[str]],
+                required=True,
+                description="""Annotation Prompt to provide more context to the task.
+                Can be used to detect or segment out specific elements in the image
+                """,
+            ),
+            InputParam(
+                "annotation_output_type",
+                type_hint=str,
+                required=True,
+                default="mask_image",
+                description="""Output type from annotation predictions. Availabe options are
+                mask_image:
+                    -black and white mask image for the given image based on the task type
+                mask_overlay:
+                    - mask overlayed on the original image
+                bounding_box:
+                    - bounding boxes drawn on the original image
+                """,
+            ),
+            InputParam(
+                "annotation_overlay",
+                type_hint=bool,
+                required=True,
+                default=False,
+                description="",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "mask_image",
+                type_hint=Image,
+                description="Inpainting Mask for input Image(s)",
+            ),
+            OutputParam(
+                "annotations",
+                type_hint=dict,
+                description="Annotations Predictions for input Image(s)",
+            ),
+            OutputParam(
+                "image",
+                type_hint=Image,
+                description="Annotated input Image(s)",
+            ),
+        ]
+
+    def get_annotations(self, components, images, prompts, task):
+        task_prompts = [task + prompt for prompt in prompts]
+
+        inputs = components.image_annotator_processor(
+            text=task_prompts, images=images, return_tensors="pt"
+        ).to(components.image_annotator.device, components.image_annotator.dtype)
+
+        generated_ids = components.image_annotator.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=1024,
+            early_stopping=False,
+            do_sample=False,
+            num_beams=3,
+        )
+        annotations = components.image_annotator_processor.batch_decode(
+            generated_ids, skip_special_tokens=False
+        )
+        outputs = []
+        for image, annotation in zip(images, annotations):
+            outputs.append(
+                components.image_annotator_processor.post_process_generation(
+                    annotation, task=task, image_size=(image.width, image.height)
+                )
+            )
+        return outputs
+
+    def prepare_mask(self, images, annotations, overlay=False, fill="white"):
+        masks = []
+        for image, annotation in zip(images, annotations):
+            mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
+            draw = ImageDraw.Draw(mask_image)
+
+            for _, _annotation in annotation.items():
+                if "polygons" in _annotation:
+                    for polygon in _annotation["polygons"]:
+                        polygon = np.array(polygon).reshape(-1, 2)
+                        if len(polygon) < 3:
+                            continue
+                        polygon = polygon.reshape(-1).tolist()
+                        draw.polygon(polygon, fill=fill)
+
+                elif "bbox" in _annotation:
+                    bbox = _annotation["bbox"]
+                    draw.rectangle(bbox, fill="white")
+
+            masks.append(mask_image)
+
+        return masks
+
+    def prepare_bounding_boxes(self, images, annotations):
+        outputs = []
+        for image, annotation in zip(images, annotations):
+            image_copy = image.copy()
+            draw = ImageDraw.Draw(image_copy)
+            for _, _annotation in annotation.items():
+                bbox = _annotation["bbox"]
+                label = _annotation["label"]
+
+                draw.rectangle(bbox, outline="red", width=3)
+                draw.text((bbox[0], bbox[1] - 20), label, fill="red")
+
+            outputs.append(image_copy)
+
+        return outputs
+
+    def prepare_inputs(self, images, prompts):
+        prompts = prompts or ""
+
+        if isinstance(images, Image.Image):
+            images = [images]
+        if isinstance(prompts, str):
+            prompts = [prompts]
+
+        if len(images) != len(prompts):
+            raise ValueError("Number of images and annotation prompts must match.")
+
+        return images, prompts
+
+    @torch.no_grad()
+    def __call__(self, components, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        images, annotation_task_prompt = self.prepare_inputs(
+            block_state.image, block_state.annotation_prompt
+        )
+        task = block_state.annotation_task
+        fill = block_state.fill
+
+        annotations = self.get_annotations(
+            components, images, annotation_task_prompt, task
+        )
+        block_state.annotations = annotations
+        if block_state.annotation_output_type == "mask_image":
+            block_state.mask_image = self.prepare_mask(images, annotations)
+        else:
+            block_state.mask_image = None
+
+        if block_state.annotation_output_type == "mask_overlay":
+            block_state.image = self.prepare_mask(images, annotations, overlay=True, fill=fill)
+
+        elif block_state.annotation_output_type == "bounding_box":
+            block_state.image = self.prepare_bounding_boxes(images, annotations)
+
+        self.set_block_state(state, block_state)
+
+        return components, state
+
+```
+
+Once we have defined our custom block, we can save it to the Hub, using either the CLI or the [`push_to_hub`] method. This will make it easy to share and reuse our custom block with other pipelines.
+
+<hfoptions id="share">
+<hfoption id="hf CLI">
+
+```shell
+# In the folder with the `block.py` file, run:
+diffusers-cli custom_block
+```
+
+Then upload the block to the Hub:
+
+```shell
+hf upload <your repo id> . .
+```
+</hfoption>
+<hfoption id="push_to_hub">
+
+```py
+from block import Florence2ImageAnnotatorBlock
+block = Florence2ImageAnnotatorBlock()
+block.push_to_hub("<your repo id>")
+```
+
+</hfoption>
+</hfoptions>
+
+## Using Custom Blocks
+
+Load the custom block with [`~ModularPipelineBlocks.from_pretrained`] and set `trust_remote_code=True`.
+
+```py
+import torch
+from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
+from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
+from diffusers.utils import load_image
+
+# Fetch the Florence2 image annotator block that will create our mask
+image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True)
+
+my_blocks = INPAINT_BLOCKS.copy()
+# insert the annotation block before the image encoding step
+my_blocks.insert("image_annotator", image_annotator_block, 1)
+
+# Create our initial set of inpainting blocks
+blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)
+
+repo_id = "diffusers/modular-stable-diffusion-xl-base-1.0"
+pipe = blocks.init_pipeline(repo_id)
+pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)
+
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
+image = image.resize((1024, 1024))
+
+prompt = ["A red car"]
+annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
+annotation_prompt = ["the car"]
+
+output = pipe(
+    prompt=prompt,
+    image=image,
+    annotation_task=annotation_task,
+    annotation_prompt=annotation_prompt,
+    annotation_output_type="mask_image",
+    num_inference_steps=35,
+    guidance_scale=7.5,
+    strength=0.95,
+    output="images"
+)
+output[0].save("florence-inpainting.png")
+```
+
+## Editing Custom Blocks
+
+By default, custom blocks are saved in your cache directory. Use the `local_dir` argument to download and edit a custom block in a specific folder.
+
+```py
+import torch
+from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
+from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
+from diffusers.utils import load_image
+
+# Fetch the Florence2 image annotator block that will create our mask
+image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True, local_dir="/my-local-folder")
+```
+
+Any changes made to the block files in this folder will be reflected when you load the block again.
diff --git a/docs/source/en/modular_diffusers/guiders.md b/docs/source/en/modular_diffusers/guiders.md
index fd0d27844205..6abe4fad2736 100644
--- a/docs/source/en/modular_diffusers/guiders.md
+++ b/docs/source/en/modular_diffusers/guiders.md
@@ -159,7 +159,7 @@ Change the [`~ComponentSpec.default_creation_method`] to `from_pretrained` and u
 ```py
 guider_spec = t2i_pipeline.get_component_spec("guider")
 guider_spec.default_creation_method="from_pretrained"
-guider_spec.repo="YiYiXu/modular-loader-t2i-guider"
+guider_spec.pretrained_model_name_or_path="YiYiXu/modular-loader-t2i-guider"
 guider_spec.subfolder="pag_guider"
 pag_guider = guider_spec.load()
 t2i_pipeline.update_components(guider=pag_guider)
diff --git a/docs/source/en/modular_diffusers/loop_sequential_pipeline_blocks.md b/docs/source/en/modular_diffusers/loop_sequential_pipeline_blocks.md
index 86c82b5145d3..a80309de19a6 100644
--- a/docs/source/en/modular_diffusers/loop_sequential_pipeline_blocks.md
+++ b/docs/source/en/modular_diffusers/loop_sequential_pipeline_blocks.md
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 
 # LoopSequentialPipelineBlocks
 
-[`~modular_pipelines.LoopSequentialPipelineBlocks`] are a multi-block type that composes other [`~modular_pipelines.ModularPipelineBlocks`] together in a loop. Data flows circularly, using `intermediate_inputs` and `intermediate_outputs`, and each block is run iteratively. This is typically used to create a denoising loop which is iterative by default.
+[`~modular_pipelines.LoopSequentialPipelineBlocks`] are a multi-block type that composes other [`~modular_pipelines.ModularPipelineBlocks`] together in a loop. Data flows circularly, using `inputs` and `intermediate_outputs`, and each block is run iteratively. This is typically used to create a denoising loop which is iterative by default.
 
 This guide shows you how to create [`~modular_pipelines.LoopSequentialPipelineBlocks`].
 
@@ -21,7 +21,6 @@ This guide shows you how to create [`~modular_pipelines.LoopSequentialPipelineBl
 [`~modular_pipelines.LoopSequentialPipelineBlocks`], is also known as the *loop wrapper* because it defines the loop structure, iteration variables, and configuration. Within the loop wrapper, you need the following variables.
 
 - `loop_inputs` are user provided values and equivalent to [`~modular_pipelines.ModularPipelineBlocks.inputs`].
-- `loop_intermediate_inputs` are intermediate variables from the [`~modular_pipelines.PipelineState`] and equivalent to [`~modular_pipelines.ModularPipelineBlocks.intermediate_inputs`].
 - `loop_intermediate_outputs` are new intermediate variables created by the block and added to the [`~modular_pipelines.PipelineState`]. It is equivalent to [`~modular_pipelines.ModularPipelineBlocks.intermediate_outputs`].
 - `__call__` method defines the loop structure and iteration logic.
 
@@ -90,4 +89,4 @@ Add more loop blocks to run within each iteration with [`~modular_pipelines.Loop
 
 ```py
 loop = LoopWrapper.from_blocks_dict({"block1": LoopBlock(), "block2": LoopBlock})
-```
\ No newline at end of file
+```
diff --git a/docs/source/en/modular_diffusers/modular_pipeline.md b/docs/source/en/modular_diffusers/modular_pipeline.md
index 0e0a7bd75d51..34cd8f72b5b7 100644
--- a/docs/source/en/modular_diffusers/modular_pipeline.md
+++ b/docs/source/en/modular_diffusers/modular_pipeline.md
@@ -313,14 +313,14 @@ unet_spec
 ComponentSpec(
     name='unet',
     type_hint=<class 'diffusers.models.unets.unet_2d_condition.UNet2DConditionModel'>,
-    repo='RunDiffusion/Juggernaut-XL-v9',
+    pretrained_model_name_or_path='RunDiffusion/Juggernaut-XL-v9',
     subfolder='unet',
     variant='fp16',
     default_creation_method='from_pretrained'
 )
 
 # modify to load from a different repository
-unet_spec.repo = "stabilityai/stable-diffusion-xl-base-1.0"
+unet_spec.pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
 
 # load component with modified spec
 unet = unet_spec.load(torch_dtype=torch.float16)
diff --git a/docs/source/en/modular_diffusers/pipeline_block.md b/docs/source/en/modular_diffusers/pipeline_block.md
index 66d26b021456..06c115e1fb52 100644
--- a/docs/source/en/modular_diffusers/pipeline_block.md
+++ b/docs/source/en/modular_diffusers/pipeline_block.md
@@ -37,17 +37,7 @@ A [`~modular_pipelines.ModularPipelineBlocks`] requires `inputs`, and `intermedi
     ]
     ```
 
-- `intermediate_inputs` are values typically created from a previous block but it can also be directly provided if no preceding block generates them. Unlike `inputs`, `intermediate_inputs` can be modified.
-
-    Use `InputParam` to define `intermediate_inputs`.
-
-    ```py
-    user_intermediate_inputs = [
-        InputParam(name="processed_image", type_hint="torch.Tensor", description="image that has been preprocessed and normalized"),
-    ]
-    ```
-
-- `intermediate_outputs` are new values created by a block and added to the [`~modular_pipelines.PipelineState`]. The `intermediate_outputs` are available as `intermediate_inputs` for subsequent blocks or available as the final output from running the pipeline.
+- `intermediate_outputs` are new values created by a block and added to the [`~modular_pipelines.PipelineState`]. The `intermediate_outputs` are available as `inputs` for subsequent blocks or available as the final output from running the pipeline.
 
     Use `OutputParam` to define `intermediate_outputs`.
 
@@ -65,8 +55,8 @@ The intermediate inputs and outputs share data to connect blocks. They are acces
 
 The computation a block performs is defined in the `__call__` method and it follows a specific structure.
 
-1. Retrieve the [`~modular_pipelines.BlockState`] to get a local view of the `inputs` and `intermediate_inputs`.
-2. Implement the computation logic on the `inputs` and `intermediate_inputs`.
+1. Retrieve the [`~modular_pipelines.BlockState`] to get a local view of the `inputs`
+2. Implement the computation logic on the `inputs`.
 3. Update [`~modular_pipelines.PipelineState`] to push changes from the local [`~modular_pipelines.BlockState`] back to the global [`~modular_pipelines.PipelineState`].
 4. Return the components and state which becomes available to the next block.
 
@@ -76,7 +66,7 @@ def __call__(self, components, state):
     block_state = self.get_block_state(state)
 
     # Your computation logic here
-    # block_state contains all your inputs and intermediate_inputs
+    # block_state contains all your inputs
     # Access them like: block_state.image, block_state.processed_image
 
     # Update the pipeline state with your updated block_states
@@ -112,4 +102,4 @@ def __call__(self, components, state):
     unet = components.unet
     vae = components.vae
     scheduler = components.scheduler
-```
\ No newline at end of file
+```
diff --git a/docs/source/en/modular_diffusers/quickstart.md b/docs/source/en/modular_diffusers/quickstart.md
index 9d4eaa0c0c3d..32d14d84e243 100644
--- a/docs/source/en/modular_diffusers/quickstart.md
+++ b/docs/source/en/modular_diffusers/quickstart.md
@@ -183,7 +183,7 @@ from diffusers.modular_pipelines import ComponentsManager
 components = ComponentManager()
 
 dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", components_manager=components, collection="diffdiff")
-dd_pipeline.load_default_componenets(torch_dtype=torch.float16)
+dd_pipeline.load_componenets(torch_dtype=torch.float16)
 dd_pipeline.to("cuda")
 ```
 
diff --git a/docs/source/en/modular_diffusers/sequential_pipeline_blocks.md b/docs/source/en/modular_diffusers/sequential_pipeline_blocks.md
index bbeb28aae5a4..f1549a26b86f 100644
--- a/docs/source/en/modular_diffusers/sequential_pipeline_blocks.md
+++ b/docs/source/en/modular_diffusers/sequential_pipeline_blocks.md
@@ -12,11 +12,11 @@ specific language governing permissions and limitations under the License.
 
 # SequentialPipelineBlocks
 
-[`~modular_pipelines.SequentialPipelineBlocks`] are a multi-block type that composes other [`~modular_pipelines.ModularPipelineBlocks`] together in a sequence. Data flows linearly from one block to the next using `intermediate_inputs` and `intermediate_outputs`. Each block in [`~modular_pipelines.SequentialPipelineBlocks`] usually represents a step in the pipeline, and by combining them, you gradually build a pipeline.
+[`~modular_pipelines.SequentialPipelineBlocks`] are a multi-block type that composes other [`~modular_pipelines.ModularPipelineBlocks`] together in a sequence. Data flows linearly from one block to the next using `inputs` and `intermediate_outputs`. Each block in [`~modular_pipelines.SequentialPipelineBlocks`] usually represents a step in the pipeline, and by combining them, you gradually build a pipeline.
 
 This guide shows you how to connect two blocks into a [`~modular_pipelines.SequentialPipelineBlocks`].
 
-Create two [`~modular_pipelines.ModularPipelineBlocks`]. The first block, `InputBlock`, outputs a `batch_size` value and the second block, `ImageEncoderBlock` uses `batch_size` as `intermediate_inputs`.
+Create two [`~modular_pipelines.ModularPipelineBlocks`]. The first block, `InputBlock`, outputs a `batch_size` value and the second block, `ImageEncoderBlock` uses `batch_size` as `inputs`.
 
 <hfoptions id="sequential">
 <hfoption id="InputBlock">
@@ -110,4 +110,4 @@ Inspect the sub-blocks in [`~modular_pipelines.SequentialPipelineBlocks`] by cal
 ```py
 print(blocks)
 print(blocks.doc)
-```
\ No newline at end of file
+```
diff --git a/docs/source/en/optimization/attention_backends.md b/docs/source/en/optimization/attention_backends.md
index edfdcc38b50b..f3ff4781c6ec 100644
--- a/docs/source/en/optimization/attention_backends.md
+++ b/docs/source/en/optimization/attention_backends.md
@@ -32,7 +32,7 @@ This guide will show you how to set and use the different attention backends.
 
 The [`~ModelMixin.set_attention_backend`] method iterates through all the modules in the model and sets the appropriate attention backend to use. The attention backend setting persists until [`~ModelMixin.reset_attention_backend`] is called.
 
-The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [kernel](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup.
+The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [`kernels`](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup.
 
 > [!NOTE]
 > FlashAttention-3 is not supported for non-Hopper architectures, in which case, use FlashAttention with `set_attention_backend("flash")`.
@@ -139,12 +139,16 @@ Refer to the table below for a complete list of available attention backends and
 | `_native_npu` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | NPU-optimized attention |
 | `_native_xla` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | XLA-optimized attention |
 | `flash` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-2 |
+| `flash_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-2 from kernels |
 | `flash_varlen` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention |
+| `flash_varlen_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention from kernels |
 | `aiter` | [AI Tensor Engine for ROCm](https://github.com/ROCm/aiter) | FlashAttention for AMD ROCm |
 | `_flash_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 |
 | `_flash_varlen_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention-3 |
 | `_flash_3_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 from kernels |
+| `_flash_3_varlen_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention-3 from kernels |
 | `sage` | [SageAttention](https://github.com/thu-ml/SageAttention) | Quantized attention (INT8 QK) |
+| `sage_hub` | [SageAttention](https://github.com/thu-ml/SageAttention) | Quantized attention (INT8 QK) from kernels |
 | `sage_varlen` | [SageAttention](https://github.com/thu-ml/SageAttention) | Variable length SageAttention |
 | `_sage_qk_int8_pv_fp8_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP8 PV (CUDA) |
 | `_sage_qk_int8_pv_fp8_cuda_sm90` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP8 PV (SM90) |
@@ -152,4 +156,4 @@ Refer to the table below for a complete list of available attention backends and
 | `_sage_qk_int8_pv_fp16_triton` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (Triton) |
 | `xformers` | [xFormers](https://github.com/facebookresearch/xformers) | Memory-efficient attention |
 
-</details>
\ No newline at end of file
+</details>
diff --git a/docs/source/en/optimization/cache.md b/docs/source/en/optimization/cache.md
index 881529b27ff1..6397c7d4cd2e 100644
--- a/docs/source/en/optimization/cache.md
+++ b/docs/source/en/optimization/cache.md
@@ -66,4 +66,35 @@ config = FasterCacheConfig(
     tensor_format="BFCHW",
 )
 pipeline.transformer.enable_cache(config)
+```
+
+## TaylorSeer Cache
+
+[TaylorSeer Cache](https://huggingface.co/papers/2403.06923) accelerates diffusion inference by using Taylor series expansions to approximate and cache intermediate activations across denoising steps. The method predicts future outputs based on past computations, reusing them at specified intervals to reduce redundant calculations.
+
+This caching mechanism delivers strong results with minimal additional memory overhead. For detailed performance analysis, see [our findings here](https://github.com/huggingface/diffusers/pull/12648#issuecomment-3610615080).
+
+To enable TaylorSeer Cache, create a [`TaylorSeerCacheConfig`] and pass it to your pipeline's transformer:
+
+- `cache_interval`: Number of steps to reuse cached outputs before performing a full forward pass
+- `disable_cache_before_step`: Initial steps that use full computations to gather data for approximations
+- `max_order`: Approximation accuracy (in theory, higher values improve quality but increase memory usage but we recommend it should be set to `1`)
+
+```python
+import torch
+from diffusers import FluxPipeline, TaylorSeerCacheConfig
+
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.bfloat16,
+)
+pipe.to("cuda")
+
+config = TaylorSeerCacheConfig(
+    cache_interval=5,
+    max_order=1,
+    disable_cache_before_step=10,
+    taylor_factors_dtype=torch.bfloat16,
+)
+pipe.transformer.enable_cache(config)
 ```
\ No newline at end of file
diff --git a/docs/source/en/quantization/modelopt.md b/docs/source/en/quantization/modelopt.md
index 06933d47c221..c7fca9d44491 100644
--- a/docs/source/en/quantization/modelopt.md
+++ b/docs/source/en/quantization/modelopt.md
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->
 
 # NVIDIA ModelOpt
 
-[NVIDIA-ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
+[NVIDIA-ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
 
 Before you begin, make sure you have nvidia_modelopt installed.
 
@@ -57,7 +57,7 @@ image.save("output.png")
 >
 > The quantization methods in NVIDIA-ModelOpt are designed to reduce the memory footprint of model weights using various QAT (Quantization-Aware Training) and PTQ (Post-Training Quantization) techniques while maintaining model performance. However, the actual performance gain during inference depends on the deployment framework (e.g., TRT-LLM, TensorRT) and the specific hardware configuration.  
 > 
-> More details can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples).
+> More details can be found [here](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples).
 
 ## NVIDIAModelOptConfig
 
@@ -86,7 +86,7 @@ The quantization methods supported are as follows:
 | **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`|
 
 
-Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
+Refer to the [official modelopt documentation](https://nvidia.github.io/Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
 
 ## Serializing and Deserializing quantized models
 
diff --git a/docs/source/en/using-diffusers/automodel.md b/docs/source/en/using-diffusers/automodel.md
new file mode 100644
index 000000000000..957cbd17e3f7
--- /dev/null
+++ b/docs/source/en/using-diffusers/automodel.md
@@ -0,0 +1,46 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AutoModel
+
+The [`AutoModel`] class automatically detects and loads the correct model class (UNet, transformer, VAE) from a `config.json` file. You don't need to know the specific model class name ahead of time. It supports data types and device placement, and works across model types and libraries.
+
+The example below loads a transformer from Diffusers and a text encoder from Transformers. Use the `subfolder` parameter to specify where to load the `config.json` file from.
+
+```py
+import torch
+from diffusers import AutoModel, DiffusionPipeline
+
+transformer = AutoModel.from_pretrained(
+    "Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+
+text_encoder = AutoModel.from_pretrained(
+    "Qwen/Qwen-Image", subfolder="text_encoder", torch_dtype=torch.bfloat16, device_map="cuda"
+)
+```
+
+[`AutoModel`] also loads models from the [Hub](https://huggingface.co/models) that aren't included in Diffusers. Set `trust_remote_code=True` in [`AutoModel.from_pretrained`] to load custom models.
+
+```py
+import torch
+from diffusers import AutoModel
+
+transformer = AutoModel.from_pretrained(
+    "custom/custom-transformer-model", trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda"
+)
+```
+
+If the custom model inherits from the [`ModelMixin`] class, it gets access to the same features as Diffusers model classes, like [regional compilation](../optimization/fp16#regional-compilation) and [group offloading](../optimization/memory#group-offloading).
+
+> [!NOTE]
+> Learn more about implementing custom models in the [Community components](../using-diffusers/custom_pipeline_overview#community-components) guide.
\ No newline at end of file
diff --git a/docs/source/pt/_toctree.yml b/docs/source/pt/_toctree.yml
index c34297a4743f..1ff6e8f7a1b6 100644
--- a/docs/source/pt/_toctree.yml
+++ b/docs/source/pt/_toctree.yml
@@ -1,8 +1,10 @@
 - sections:
-    - local: index
-      title: 🧨 Diffusers
-    - local: quicktour
-      title: Tour rápido
-    - local: installation
-      title: Instalação
+  - local: index
+    title: Diffusers
+  - local: installation
+    title: Instalação
+  - local: quicktour
+    title: Tour rápido
+  - local: stable_diffusion
+    title: Desempenho básico
   title: Primeiros passos
diff --git a/docs/source/pt/index.md b/docs/source/pt/index.md
index e491912b355a..69096ba95e9a 100644
--- a/docs/source/pt/index.md
+++ b/docs/source/pt/index.md
@@ -18,11 +18,11 @@ specific language governing permissions and limitations under the License.
 
 # Diffusers
 
-🤗 Diffusers é uma biblioteca de modelos de difusão de última geração para geração de imagens, áudio e até mesmo estruturas 3D de moléculas. Se você está procurando uma solução de geração simples ou queira treinar seu próprio modelo de difusão, 🤗 Diffusers é uma modular caixa de ferramentas que suporta ambos. Nossa biblioteca é desenhada com foco em [usabilidade em vez de desempenho](conceptual/philosophy#usability-over-performance), [simples em vez de fácil](conceptual/philosophy#simple-over-easy) e [customizável em vez de abstrações](conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
+🤗 Diffusers é uma biblioteca de modelos de difusão de última geração para geração de imagens, áudio e até mesmo estruturas 3D de moléculas. Se você está procurando uma solução de geração simples ou quer treinar seu próprio modelo de difusão, 🤗 Diffusers é uma caixa de ferramentas modular que suporta ambos. Nossa biblioteca é desenhada com foco em [usabilidade em vez de desempenho](conceptual/philosophy#usability-over-performance), [simples em vez de fácil](conceptual/philosophy#simple-over-easy) e [customizável em vez de abstrações](conceptual/philosophy#tweakable-contributorfriendly-over-abstraction).
 
 A Biblioteca tem três componentes principais:
 
-- Pipelines de última geração para a geração em poucas linhas de código. Têm muitos pipelines no 🤗 Diffusers, veja a tabela no pipeline [Visão geral](api/pipelines/overview) para uma lista completa de pipelines disponíveis e as tarefas que eles resolvem.
+- Pipelines de última geração para a geração em poucas linhas de código. Há muitos pipelines no 🤗 Diffusers, veja a tabela no pipeline [Visão geral](api/pipelines/overview) para uma lista completa de pipelines disponíveis e as tarefas que eles resolvem.
 - Intercambiáveis [agendadores de ruído](api/schedulers/overview) para balancear as compensações entre velocidade e qualidade de geração.
 - [Modelos](api/models) pré-treinados que podem ser usados como se fossem blocos de construção, e combinados com agendadores, para criar seu próprio sistema de difusão de ponta a ponta.
 
diff --git a/docs/source/pt/installation.md b/docs/source/pt/installation.md
index acc767110cb9..24c7e4bb7385 100644
--- a/docs/source/pt/installation.md
+++ b/docs/source/pt/installation.md
@@ -21,7 +21,7 @@ specific language governing permissions and limitations under the License.
 
 Recomenda-se instalar 🤗 Diffusers em um [ambiente virtual](https://docs.python.org/3/library/venv.html).
 Se você não está familiarizado com ambiente virtuals, veja o [guia](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
-Um ambiente virtual deixa mais fácil gerenciar diferentes projetos e evitar problemas de compatibilidade entre dependências.
+Um ambiente virtual facilita gerenciar diferentes projetos e evitar problemas de compatibilidade entre dependências.
 
 Comece criando um ambiente virtual no diretório do projeto:
 
@@ -100,12 +100,12 @@ pip install -e ".[flax]"
 </jax>
 </frameworkcontent>
 
-Esses comandos irá linkar a pasta que você clonou o repositório e os caminhos das suas bibliotecas Python.
+Esses comandos irão vincular a pasta que você clonou o repositório e os caminhos das suas bibliotecas Python.
 Python então irá procurar dentro da pasta que você clonou além dos caminhos normais das bibliotecas.
 Por exemplo, se o pacote python for tipicamente instalado no `~/anaconda3/envs/main/lib/python3.10/site-packages/`, o Python também irá procurar na pasta `~/diffusers/` que você clonou.
 
 > [!WARNING]
-> Você deve deixar a pasta `diffusers` se você quiser continuar usando a biblioteca.
+> Você deve manter a pasta `diffusers` se quiser continuar usando a biblioteca.
 
 Agora você pode facilmente atualizar seu clone para a última versão do 🤗 Diffusers com o seguinte comando:
 
diff --git a/docs/source/pt/stable_diffusion.md b/docs/source/pt/stable_diffusion.md
new file mode 100644
index 000000000000..ffc5df643454
--- /dev/null
+++ b/docs/source/pt/stable_diffusion.md
@@ -0,0 +1,132 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+[[open-in-colab]]
+
+# Desempenho básico
+
+Difusão é um processo aleatório que demanda muito processamento. Você pode precisar executar o [`DiffusionPipeline`] várias vezes antes de obter o resultado desejado. Por isso é importante equilibrar cuidadosamente a velocidade de geração e o uso de memória para iterar mais rápido.
+
+Este guia recomenda algumas dicas básicas de desempenho para usar o [`DiffusionPipeline`]. Consulte a seção de documentação sobre Otimização de Inferência, como [Acelerar inferência](./optimization/fp16) ou [Reduzir uso de memória](./optimization/memory) para guias de desempenho mais detalhados.
+
+## Uso de memória
+
+Reduzir a quantidade de memória usada indiretamente acelera a geração e pode ajudar um modelo a caber no dispositivo.
+
+O método [`~DiffusionPipeline.enable_model_cpu_offload`] move um modelo para a CPU quando não está em uso para economizar memória da GPU.
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+  "stabilityai/stable-diffusion-xl-base-1.0",
+  torch_dtype=torch.bfloat16,
+  device_map="cuda"
+)
+pipeline.enable_model_cpu_offload()
+
+prompt = """
+cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
+highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
+"""
+pipeline(prompt).images[0]
+print(f"Memória máxima reservada: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+```
+
+## Velocidade de inferência
+
+O processo de remoção de ruído é o mais exigente computacionalmente durante a difusão. Métodos que otimizam este processo aceleram a velocidade de inferência. Experimente os seguintes métodos para acelerar.
+
+- Adicione `device_map="cuda"` para colocar o pipeline em uma GPU. Colocar um modelo em um acelerador, como uma GPU, aumenta a velocidade porque realiza computações em paralelo.
+- Defina `torch_dtype=torch.bfloat16` para executar o pipeline em meia-precisão. Reduzir a precisão do tipo de dado aumenta a velocidade porque leva menos tempo para realizar computações em precisão mais baixa.
+
+```py
+import torch
+import time
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+
+pipeline = DiffusionPipeline.from_pretrained(
+  "stabilityai/stable-diffusion-xl-base-1.0",
+  torch_dtype=torch.bfloat16,
+  device_map="cuda"
+)
+```
+
+- Use um agendador mais rápido, como [`DPMSolverMultistepScheduler`], que requer apenas ~20-25 passos.
+- Defina `num_inference_steps` para um valor menor. Reduzir o número de passos de inferência reduz o número total de computações. No entanto, isso pode resultar em menor qualidade de geração.
+
+```py
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+
+prompt = """
+cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
+highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
+"""
+
+start_time = time.perf_counter()
+image = pipeline(prompt).images[0]
+end_time = time.perf_counter()
+
+print(f"Geração de imagem levou {end_time - start_time:.3f} segundos")
+```
+
+## Qualidade de geração
+
+Muitos modelos de difusão modernos entregam imagens de alta qualidade imediatamente. No entanto, você ainda pode melhorar a qualidade de geração experimentando o seguinte.
+
+- Experimente um prompt mais detalhado e descritivo. Inclua detalhes como o meio da imagem, assunto, estilo e estética. Um prompt negativo também pode ajudar, guiando um modelo para longe de características indesejáveis usando palavras como baixa qualidade ou desfocado.
+
+    ```py
+    import torch
+    from diffusers import DiffusionPipeline
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.bfloat16,
+        device_map="cuda"
+    )
+
+    prompt = """
+    cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
+    highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
+    """
+    negative_prompt = "low quality, blurry, ugly, poor details"
+    pipeline(prompt, negative_prompt=negative_prompt).images[0]
+    ```
+
+    Para mais detalhes sobre como criar prompts melhores, consulte a documentação sobre [Técnicas de prompt](./using-diffusers/weighted_prompts).
+
+- Experimente um agendador diferente, como [`HeunDiscreteScheduler`] ou [`LMSDiscreteScheduler`], que sacrifica velocidade de geração por qualidade.
+
+    ```py
+    import torch
+    from diffusers import DiffusionPipeline, HeunDiscreteScheduler
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.bfloat16,
+        device_map="cuda"
+    )
+    pipeline.scheduler = HeunDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+    prompt = """
+    cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
+    highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
+    """
+    negative_prompt = "low quality, blurry, ugly, poor details"
+    pipeline(prompt, negative_prompt=negative_prompt).images[0]
+    ```
+
+## Próximos passos
+
+Diffusers oferece otimizações mais avançadas e poderosas, como [group-offloading](./optimization/memory#group-offloading) e [compilação regional](./optimization/fp16#regional-compilation). Para saber mais sobre como maximizar o desempenho, consulte a seção sobre Otimização de Inferência.
diff --git a/docs/source/zh/modular_diffusers/guiders.md b/docs/source/zh/modular_diffusers/guiders.md
index 1006460a2bec..50436f90c4a5 100644
--- a/docs/source/zh/modular_diffusers/guiders.md
+++ b/docs/source/zh/modular_diffusers/guiders.md
@@ -157,7 +157,7 @@ guider.push_to_hub("YiYiXu/modular-loader-t2i-guider", subfolder="pag_guider")
 ```py
 guider_spec = t2i_pipeline.get_component_spec("guider")
 guider_spec.default_creation_method="from_pretrained"
-guider_spec.repo="YiYiXu/modular-loader-t2i-guider"
+guider_spec.pretrained_model_name_or_path="YiYiXu/modular-loader-t2i-guider"
 guider_spec.subfolder="pag_guider"
 pag_guider = guider_spec.load()
 t2i_pipeline.update_components(guider=pag_guider)
diff --git a/docs/source/zh/modular_diffusers/modular_pipeline.md b/docs/source/zh/modular_diffusers/modular_pipeline.md
index daf61ecf40d9..a57fdf227506 100644
--- a/docs/source/zh/modular_diffusers/modular_pipeline.md
+++ b/docs/source/zh/modular_diffusers/modular_pipeline.md
@@ -313,14 +313,14 @@ unet_spec
 ComponentSpec(
     name='unet',
     type_hint=<class 'diffusers.models.unets.unet_2d_condition.UNet2DConditionModel'>,
-    repo='RunDiffusion/Juggernaut-XL-v9',
+    pretrained_model_name_or_path='RunDiffusion/Juggernaut-XL-v9',
     subfolder='unet',
     variant='fp16',
     default_creation_method='from_pretrained'
 )
 
 # 修改以从不同的仓库加载
-unet_spec.repo = "stabilityai/stable-diffusion-xl-base-1.0"
+unet_spec.pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
 
 # 使用修改后的规范加载组件
 unet = unet_spec.load(torch_dtype=torch.float16)
diff --git a/examples/community/README.md b/examples/community/README.md
index 4a4b0f5fd9f5..4ff9c4d77704 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -88,7 +88,7 @@ PIXART-α Controlnet pipeline | Implementation of the controlnet model for pixar
 | FaithDiff Stable Diffusion XL Pipeline | Implementation of [(CVPR 2025) FaithDiff: Unleashing Diffusion Priors for Faithful Image Super-resolutionUnleashing Diffusion Priors for Faithful Image Super-resolution](https://huggingface.co/papers/2411.18824) - FaithDiff is a faithful image super-resolution method that leverages latent diffusion models by actively adapting the diffusion prior and jointly fine-tuning its components (encoder and diffusion model) with an alignment module to ensure high fidelity and structural consistency. | [FaithDiff Stable Diffusion XL Pipeline](#faithdiff-stable-diffusion-xl-pipeline) | [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/jychen9811/FaithDiff) | [Junyang Chen, Jinshan Pan, Jiangxin Dong, IMAG Lab, (Adapted by Eliseu Silva)](https://github.com/JyChen9811/FaithDiff) |
 | Stable Diffusion 3 InstructPix2Pix Pipeline | Implementation of Stable Diffusion 3 InstructPix2Pix Pipeline | [Stable Diffusion 3 InstructPix2Pix Pipeline](#stable-diffusion-3-instructpix2pix-pipeline) | [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/BleachNick/SD3_UltraEdit_freeform) [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/CaptainZZZ/sd3-instructpix2pix) | [Jiayu Zhang](https://github.com/xduzhangjiayu) and [Haozhe Zhao](https://github.com/HaozheZhao)|
 | Flux Kontext multiple images | A modified version of the `FluxKontextPipeline` that supports calling Flux Kontext with multiple reference images.| [Flux Kontext multiple input Pipeline](#flux-kontext-multiple-images) | - |  [Net-Mist](https://github.com/Net-Mist) |
-
+| Flux Fill ControlNet Pipeline | A modified version of the `FluxFillPipeline` and `FluxControlNetInpaintPipeline` that supports Controlnet with Flux Fill model.| [Flux Fill ControlNet Pipeline](#Flux-Fill-ControlNet-Pipeline) | - |  [pratim4dasude](https://github.com/pratim4dasude) |
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 
@@ -5488,7 +5488,7 @@ Editing at Scale", many thanks to their contribution!
 
 This implementation of Flux Kontext allows users to pass multiple reference images. Each image is encoded separately, and the resulting latent vectors are concatenated.
 
-As explained in Section 3 of [the paper](https://arxiv.org/pdf/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
+As explained in Section 3 of [the paper](https://huggingface.co/papers/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
 
 ## Example Usage
 
@@ -5527,3 +5527,106 @@ images = pipe(
 ).images
 images[0].save("pizzeria.png")
 ```
+
+# Flux Fill ControlNet Pipeline
+
+This implementation of Flux Fill + ControlNet Inpaint combines the fill-style masked editing of FLUX.1-Fill-dev with full ControlNet conditioning. The base image is processed through the Fill model while the ControlNet receives the corresponding conditioning input (depth, canny, pose, etc.), and both outputs are fused during denoising to guide structure and composition.
+
+While FLUX.1-Fill-dev is designed for mask-based edits, it was not originally trained to operate jointly with ControlNet. In practice, this combined setup works well for structured inpainting tasks, though results may vary depending on the conditioning strength and the alignment between the mask and the control input.
+
+## Example Usage
+
+
+```python
+import torch
+from diffusers import (
+    FluxControlNetModel,
+    FluxPriorReduxPipeline,
+)
+from diffusers.utils import load_image
+
+# NEW PIPELINE (updated name)
+from pipline_flux_fill_controlnet_Inpaint import  FluxControlNetFillInpaintPipeline
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.bfloat16
+
+# Models
+base_model = "black-forest-labs/FLUX.1-Fill-dev"
+controlnet_model = "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0"
+prior_model = "black-forest-labs/FLUX.1-Redux-dev"
+
+# Load ControlNet
+controlnet = FluxControlNetModel.from_pretrained(
+    controlnet_model,
+    torch_dtype=dtype,
+)
+
+# Load Fill + ControlNet Pipeline
+fill_pipe = FluxControlNetFillInpaintPipeline.from_pretrained(
+    base_model,
+    controlnet=controlnet,
+    torch_dtype=dtype,
+).to(device)
+
+# OPTIONAL FP8
+# fill_pipe.transformer.enable_layerwise_casting(
+#     storage_dtype=torch.float8_e4m3fn,
+#     compute_dtype=torch.bfloat16
+# )
+
+#  OPTIONAL Prior Redux
+#pipe_prior_redux = FluxPriorReduxPipeline.from_pretrained(
+#    prior_model,
+#    torch_dtype=dtype,
+#).to(device)
+
+# Inputs
+
+# combined_image = load_image("person_input.png")
+
+
+# 1. Prior conditioning
+#prior_out = pipe_prior_redux(
+#    image=cloth_image,
+#    prompt=cloth_prompt,
+#)
+
+# 2. Fill Inpaint with ControlNet
+
+# canny (0), tile (1), depth (2), blur (3), pose (4), gray (5), low quality (6).
+
+img = load_image(r"imgs/background.jpg")
+mask = load_image(r"imgs/mask.png")
+
+control_image_depth = load_image(r"imgs/dog_depth _2.png")
+
+result = fill_pipe(
+    prompt="a dog on a bench",
+    image=img,
+    mask_image=mask,
+
+    control_image=control_image_depth,
+    control_mode=[2],  # union mode
+    control_guidance_start=0.0,
+    control_guidance_end=0.8,
+    controlnet_conditioning_scale=0.9,
+
+    height=1024,
+    width=1024,
+
+    strength=1.0,
+    guidance_scale=50.0,
+    num_inference_steps=60,
+    max_sequence_length=512,
+
+#    **prior_out,
+)
+
+# result.images[0].save("flux_fill_controlnet_inpaint.png")
+
+from datetime import datetime
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+result.images[0].save(f"flux_fill_controlnet_inpaint_depth{timestamp}.jpg")
+```
+
diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
index 595df107cacb..bef682425a2c 100644
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -45,7 +45,7 @@ def check_size(image, height, width):
         raise ValueError(f"Image size should be {height}x{width}, but got {h}x{w}")
 
 
-def overlay_inner_image(image, inner_image, paste_offset: Tuple[int] = (0, 0)):
+def overlay_inner_image(image, inner_image, paste_offset: Tuple[int, ...] = (0, 0)):
     inner_image = inner_image.convert("RGBA")
     image = image.convert("RGB")
 
diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py
index 95d3405df50d..1aba1f270198 100644
--- a/examples/community/lpw_stable_diffusion_xl.py
+++ b/examples/community/lpw_stable_diffusion_xl.py
@@ -29,7 +29,6 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -1328,18 +1327,8 @@ def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, d
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (AttnProcessor2_0, XFormersAttnProcessor),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py
index 97ad8b9e86c6..09b1d1b24465 100644
--- a/examples/community/matryoshka.py
+++ b/examples/community/matryoshka.py
@@ -1966,16 +1966,21 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -2294,10 +2299,10 @@ def __init__(
 
     def _check_config(
         self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
+        down_block_types: Tuple[str, ...],
+        up_block_types: Tuple[str, ...],
         only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
+        block_out_channels: Tuple[int, ...],
         layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
diff --git a/examples/community/mixture_tiling_sdxl.py b/examples/community/mixture_tiling_sdxl.py
index 66c338b5b2a2..e09f5a25db73 100644
--- a/examples/community/mixture_tiling_sdxl.py
+++ b/examples/community/mixture_tiling_sdxl.py
@@ -30,17 +30,13 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
 from diffusers.utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_invisible_watermark_available,
     is_torch_xla_available,
     logging,
@@ -710,22 +706,8 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches, device, dtype):
         return torch.tile(weights_torch, (nbatches, self.unet.config.in_channels, 1, 1))
 
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/examples/community/mod_controlnet_tile_sr_sdxl.py b/examples/community/mod_controlnet_tile_sr_sdxl.py
index 27249ce3fb5e..fe8bd73eabbe 100644
--- a/examples/community/mod_controlnet_tile_sr_sdxl.py
+++ b/examples/community/mod_controlnet_tile_sr_sdxl.py
@@ -39,16 +39,13 @@
     MultiControlNetModel,
     UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
 from diffusers.utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -1220,23 +1217,9 @@ def prepare_tiles(
 
         return tile_weights, tile_row_overlaps, tile_col_overlaps
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     @property
     def guidance_scale(self):
diff --git a/examples/community/pipeline_controlnet_xl_kolors.py b/examples/community/pipeline_controlnet_xl_kolors.py
index dc90aacdbc6b..96c801bab379 100644
--- a/examples/community/pipeline_controlnet_xl_kolors.py
+++ b/examples/community/pipeline_controlnet_xl_kolors.py
@@ -40,10 +40,6 @@
     MultiControlNetModel,
     UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -760,21 +756,8 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     @property
     def guidance_scale(self):
diff --git a/examples/community/pipeline_controlnet_xl_kolors_img2img.py b/examples/community/pipeline_controlnet_xl_kolors_img2img.py
index 189d0312143f..41f2550138b3 100644
--- a/examples/community/pipeline_controlnet_xl_kolors_img2img.py
+++ b/examples/community/pipeline_controlnet_xl_kolors_img2img.py
@@ -40,10 +40,6 @@
     MultiControlNetModel,
     UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -930,21 +926,8 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     @property
     def guidance_scale(self):
diff --git a/examples/community/pipeline_controlnet_xl_kolors_inpaint.py b/examples/community/pipeline_controlnet_xl_kolors_inpaint.py
index 3abd98482980..07bd552a29e7 100644
--- a/examples/community/pipeline_controlnet_xl_kolors_inpaint.py
+++ b/examples/community/pipeline_controlnet_xl_kolors_inpaint.py
@@ -39,10 +39,6 @@
     MultiControlNetModel,
     UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -1006,21 +1002,8 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     @property
     def denoising_end(self):
diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py
index 119b39cefe68..b5b68577982b 100644
--- a/examples/community/pipeline_demofusion_sdxl.py
+++ b/examples/community/pipeline_demofusion_sdxl.py
@@ -16,11 +16,11 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
+    deprecate,
     is_accelerate_available,
     is_accelerate_version,
     is_invisible_watermark_available,
@@ -612,20 +612,9 @@ def tiled_decode(self, latents, current_height, current_width):
 
         return image
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (AttnProcessor2_0, XFormersAttnProcessor),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
diff --git a/examples/community/pipeline_faithdiff_stable_diffusion_xl.py b/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
index a8fdc133d08b..4b564d5ee5c8 100644
--- a/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
+++ b/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
@@ -40,13 +40,6 @@
     UNet2DConditionLoadersMixin,
 )
 from diffusers.models import AutoencoderKL
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D, get_down_block
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -438,16 +431,21 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -1637,24 +1635,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         return latents
 
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
-                FusedAttnProcessor2_0,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/examples/community/pipeline_kolors_differential_img2img.py b/examples/community/pipeline_kolors_differential_img2img.py
index d299c839815e..d456af8b3385 100644
--- a/examples/community/pipeline_kolors_differential_img2img.py
+++ b/examples/community/pipeline_kolors_differential_img2img.py
@@ -22,13 +22,12 @@
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from diffusers.pipelines.kolors.pipeline_output import KolorsPipelineOutput
 from diffusers.pipelines.kolors.text_encoder import ChatGLMModel
 from diffusers.pipelines.kolors.tokenizer import ChatGLMTokenizer
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
+from diffusers.utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 
 
@@ -709,24 +708,9 @@ def _get_add_time_ids(
         add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
         return add_time_ids
 
-    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/examples/community/pipeline_kolors_inpainting.py b/examples/community/pipeline_kolors_inpainting.py
index 26517819ebd0..85a3d698efa1 100644
--- a/examples/community/pipeline_kolors_inpainting.py
+++ b/examples/community/pipeline_kolors_inpainting.py
@@ -32,12 +32,6 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
 from diffusers.pipelines.kolors import ChatGLMModel, ChatGLMTokenizer
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -1008,23 +1002,8 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/examples/community/pipeline_sdxl_style_aligned.py b/examples/community/pipeline_sdxl_style_aligned.py
index 51547599f531..ac7ad33b68c5 100644
--- a/examples/community/pipeline_sdxl_style_aligned.py
+++ b/examples/community/pipeline_sdxl_style_aligned.py
@@ -45,8 +45,6 @@
 from diffusers.models.attention_processor import (
     Attention,
     AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
 )
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -1151,22 +1149,8 @@ def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, d
         return add_time_ids
 
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     def _enable_shared_attention_processors(
         self,
diff --git a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
index e358f66b4acc..ba89556a5e4d 100644
--- a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
+++ b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py
@@ -503,24 +503,9 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         latents = latents * self.scheduler.init_noise_sigma
         return latents
 
-    # def upcast_vae(self):
-    #     dtype = self.vae.dtype
-    #     self.vae.to(dtype=torch.float32)
-    #     use_torch_2_0_or_xformers = isinstance(
-    #         self.vae.decoder.mid_block.attentions[0].processor,
-    #         (
-    #             AttnProcessor2_0,
-    #             XFormersAttnProcessor,
-    #             LoRAXFormersAttnProcessor,
-    #             LoRAAttnProcessor2_0,
-    #         ),
-    #     )
-    #     # if xformers or torch_2_0 is used attention block does not need
-    #     # to be in float32 which can save lots of memory
-    #     if use_torch_2_0_or_xformers:
-    #         self.vae.post_quant_conv.to(dtype)
-    #         self.vae.decoder.conv_in.to(dtype)
-    #         self.vae.decoder.mid_block.to(dtype)
+    def upcast_vae(self):
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
+        self.vae.to(dtype=torch.float32)
 
     @torch.no_grad()
     def __call__(
diff --git a/examples/community/pipeline_stable_diffusion_xl_attentive_eraser.py b/examples/community/pipeline_stable_diffusion_xl_attentive_eraser.py
index 65df4c03eb2c..822c804ec8a9 100644
--- a/examples/community/pipeline_stable_diffusion_xl_attentive_eraser.py
+++ b/examples/community/pipeline_stable_diffusion_xl_attentive_eraser.py
@@ -35,12 +35,6 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -1282,23 +1276,8 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
index 6dade126f282..d435a6e146b2 100644
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
@@ -25,7 +25,6 @@
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ControlNetModel, MultiAdapter, T2IAdapter, UNet2DConditionModel
-from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
@@ -34,6 +33,7 @@
 from diffusers.utils import (
     PIL_INTERPOLATION,
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -793,20 +793,9 @@ def _get_add_time_ids(
         add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
         return add_time_ids
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (AttnProcessor2_0, XFormersAttnProcessor),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
     def _default_height_width(self, height, width, image):
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
index 9ec6a90b072d..8d70e8f3c2a1 100644
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
@@ -43,7 +43,6 @@
     T2IAdapter,
     UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
@@ -52,6 +51,7 @@
 from diffusers.utils import (
     PIL_INTERPOLATION,
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -1130,20 +1130,9 @@ def _get_add_time_ids(
 
         return add_time_ids, add_neg_time_ids
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (AttnProcessor2_0, XFormersAttnProcessor),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
     def _default_height_width(self, height, width, image):
diff --git a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
index 89388e10cb19..99657facddf6 100644
--- a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
+++ b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
@@ -35,10 +35,6 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -848,21 +844,8 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/examples/community/pipeline_stable_diffusion_xl_ipex.py b/examples/community/pipeline_stable_diffusion_xl_ipex.py
index aa2b24f3965a..8e8357db3c7a 100644
--- a/examples/community/pipeline_stable_diffusion_xl_ipex.py
+++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py
@@ -32,10 +32,6 @@
     TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -658,23 +654,9 @@ def _get_add_time_ids(
         add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
         return add_time_ids
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
+        deprecate("upcast_vae", "1.0.0", "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`")
         self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
-        )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
diff --git a/examples/community/pipline_flux_fill_controlnet_Inpaint.py b/examples/community/pipline_flux_fill_controlnet_Inpaint.py
new file mode 100644
index 000000000000..cc642a767f87
--- /dev/null
+++ b/examples/community/pipline_flux_fill_controlnet_Inpaint.py
@@ -0,0 +1,1319 @@
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import (
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
+
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
+from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
+from diffusers.models.transformers import FluxTransformer2DModel
+from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import FluxControlNetInpaintPipeline
+        >>> from diffusers.models import FluxControlNetModel
+        >>> from diffusers.utils import load_image
+
+        >>> controlnet = FluxControlNetModel.from_pretrained(
+        ...     "InstantX/FLUX.1-dev-controlnet-canny", torch_dtype=torch.float16
+        ... )
+        >>> pipe = FluxControlNetInpaintPipeline.from_pretrained(
+        ...     "black-forest-labs/FLUX.1-schnell", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> control_image = load_image(
+        ...     "https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Canny-alpha/resolve/main/canny.jpg"
+        ... )
+        >>> init_image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        ... )
+        >>> mask_image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+        ... )
+
+        >>> prompt = "A girl holding a sign that says InstantX"
+        >>> image = pipe(
+        ...     prompt,
+        ...     image=init_image,
+        ...     mask_image=mask_image,
+        ...     control_image=control_image,
+        ...     control_guidance_start=0.2,
+        ...     control_guidance_end=0.8,
+        ...     controlnet_conditioning_scale=0.7,
+        ...     strength=0.7,
+        ...     num_inference_steps=28,
+        ...     guidance_scale=3.5,
+        ... ).images[0]
+        >>> image.save("flux_controlnet_inpaint.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def retrieve_latents_fill(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class FluxControlNetFillInpaintPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The Flux controlnet pipeline for inpainting.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        transformer ([`FluxTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
+        tokenizer_2 (`T5TokenizerFast`):
+            Second Tokenizer of class
+            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "control_image", "mask", "masked_image_latents"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: T5TokenizerFast,
+        transformer: FluxTransformer2DModel,
+        controlnet: Union[
+            FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
+        ],
+    ):
+        super().__init__()
+        if isinstance(controlnet, (list, tuple)):
+            controlnet = FluxMultiControlNetModel(controlnet)
+
+        self.register_modules(
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            transformer=transformer,
+            controlnet=controlnet,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.mask_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor * 2,
+            vae_latent_channels=latent_channels,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
+        )
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
+        )
+        self.default_sample_size = 128
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if isinstance(self, TextualInversionLoaderMixin):
+            prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
+
+        text_inputs = self.tokenizer_2(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_length=False,
+            return_overflowing_tokens=False,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because `max_sequence_length` is set to "
+                f" {max_sequence_length} tokens: {removed_text}"
+            )
+
+        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
+
+        dtype = self.text_encoder_2.dtype
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
+    def _get_clip_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        batch_size = len(prompt)
+
+        if isinstance(self, TextualInversionLoaderMixin):
+            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer_max_length,
+            truncation=True,
+            return_overflowing_tokens=False,
+            return_length=False,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer_max_length} tokens: {removed_text}"
+            )
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
+
+        # Use pooled output of CLIPTextModel
+        prompt_embeds = prompt_embeds.pooler_output
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        prompt_2: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in all text-encoders
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+        """
+        device = device or self._execution_device
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder_2, lora_scale)
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_2 = prompt_2 or prompt
+            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
+
+            # We only use the pooled prompt output from the CLIPTextModel
+            pooled_prompt_embeds = self._get_clip_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt_2,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        if self.text_encoder is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        if self.text_encoder_2 is not None:
+            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder_2, lora_scale)
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
+
+        return prompt_embeds, pooled_prompt_embeds, text_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def check_inputs(
+        self,
+        prompt,
+        prompt_2,
+        image,
+        mask_image,
+        strength,
+        height,
+        width,
+        output_type,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        padding_mask_crop=None,
+        max_sequence_length=None,
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt_2 is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
+            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
+
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+
+        if padding_mask_crop is not None:
+            if not isinstance(image, PIL.Image.Image):
+                raise ValueError(
+                    f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
+                )
+            if not isinstance(mask_image, PIL.Image.Image):
+                raise ValueError(
+                    f"The mask image should be a PIL image when inpainting mask crop, but is of type"
+                    f" {type(mask_image)}."
+                )
+            if output_type != "pil":
+                raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
+
+        if max_sequence_length is not None and max_sequence_length > 512:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
+
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
+
+        return latents
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+        shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+
+        image = image.to(device=device, dtype=dtype)
+        image_latents = self._encode_vae_image(image=image, generator=generator)
+
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        else:
+            noise = latents.to(device)
+            latents = noise
+
+        noise = self._pack_latents(noise, batch_size, num_channels_latents, height, width)
+        image_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        return latents, noise, image_latents, latent_image_ids
+
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        num_channels_latents,
+        num_images_per_prompt,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+    ):
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(mask, size=(height, width))
+        mask = mask.to(device=device, dtype=dtype)
+
+        batch_size = batch_size * num_images_per_prompt
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        if masked_image.shape[1] == 16:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = retrieve_latents(self.vae.encode(masked_image), generator=generator)
+
+        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        masked_image_latents = self._pack_latents(
+            masked_image_latents,
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+
+        mask = self._pack_latents(
+            mask.repeat(1, num_channels_latents, 1, 1),
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+        return mask, masked_image_latents
+
+    # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
+    def prepare_image(
+        self,
+        image,
+        width,
+        height,
+        batch_size,
+        num_images_per_prompt,
+        device,
+        dtype,
+        do_classifier_free_guidance=False,
+        guess_mode=False,
+    ):
+        if isinstance(image, torch.Tensor):
+            pass
+        else:
+            image = self.image_processor.preprocess(image, height=height, width=width)
+
+        image_batch_size = image.shape[0]
+
+        if image_batch_size == 1:
+            repeat_by = batch_size
+        else:
+            # image batch size is the same as prompt batch size
+            repeat_by = num_images_per_prompt
+
+        image = image.repeat_interleave(repeat_by, dim=0)
+
+        image = image.to(device=device, dtype=dtype)
+
+        if do_classifier_free_guidance and not guess_mode:
+            image = torch.cat([image] * 2)
+
+        return image
+
+    def prepare_mask_latents_fill(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        num_channels_latents,
+        num_images_per_prompt,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+    ):
+        # 1. calculate the height and width of the latents
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+
+        # 2. encode the masked image
+        if masked_image.shape[1] == num_channels_latents:
+            masked_image_latents = masked_image
+        else:
+            masked_image_latents = retrieve_latents_fill(self.vae.encode(masked_image), generator=generator)
+
+        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+
+        # 3. duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        batch_size = batch_size * num_images_per_prompt
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+        # 4. pack the masked_image_latents
+        # batch_size, num_channels_latents, height, width -> batch_size, height//2 * width//2 , num_channels_latents*4
+        masked_image_latents = self._pack_latents(
+            masked_image_latents,
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+        )
+
+        # 5.resize mask to latents shape we we concatenate the mask to the latents
+        mask = mask[:, 0, :, :]  # batch_size, 8 * height, 8 * width (mask has not been 8x compressed)
+        mask = mask.view(
+            batch_size, height, self.vae_scale_factor, width, self.vae_scale_factor
+        )  # batch_size, height, 8, width, 8
+        mask = mask.permute(0, 2, 4, 1, 3)  # batch_size, 8, 8, height, width
+        mask = mask.reshape(
+            batch_size, self.vae_scale_factor * self.vae_scale_factor, height, width
+        )  # batch_size, 8*8, height, width
+
+        # 6. pack the mask:
+        # batch_size, 64, height, width -> batch_size, height//2 * width//2 , 64*2*2
+        mask = self._pack_latents(
+            mask,
+            batch_size,
+            self.vae_scale_factor * self.vae_scale_factor,
+            height,
+            width,
+        )
+        mask = mask.to(device=device, dtype=dtype)
+
+        return mask, masked_image_latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        prompt_2: Optional[Union[str, List[str]]] = None,
+        image: PipelineImageInput = None,
+        mask_image: PipelineImageInput = None,
+        masked_image_latents: PipelineImageInput = None,
+        control_image: PipelineImageInput = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        strength: float = 0.6,
+        padding_mask_crop: Optional[int] = None,
+        sigmas: Optional[List[float]] = None,
+        num_inference_steps: int = 28,
+        guidance_scale: float = 7.0,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        control_mode: Optional[Union[int, List[int]]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation.
+            prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`.
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The image(s) to inpaint.
+            mask_image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The mask image(s) to use for inpainting. White pixels in the mask will be repainted, while black pixels
+                will be preserved.
+            masked_image_latents (`torch.FloatTensor`, *optional*):
+                Pre-generated masked image latents.
+            control_image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The ControlNet input condition. Image to control the generation.
+            height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            strength (`float`, *optional*, defaults to 0.6):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1.
+            padding_mask_crop (`int`, *optional*):
+                The size of the padding to use when cropping the mask.
+            num_inference_steps (`int`, *optional*, defaults to 28):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 7.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            control_mode (`int` or `List[int]`, *optional*):
+                The mode for the ControlNet. If multiple ControlNets are used, this should be a list.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original transformer.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or more [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to
+                make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                Additional keyword arguments to be passed to the joint attention mechanism.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising step during the inference.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                The maximum length of the sequence to be generated.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
+        """
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        global_height = height
+        global_width = width
+
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(self.controlnet.nets) if isinstance(self.controlnet, FluxMultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs
+        self.check_inputs(
+            prompt,
+            prompt_2,
+            image,
+            mask_image,
+            strength,
+            height,
+            width,
+            output_type=output_type,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            padding_mask_crop=padding_mask_crop,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+        )
+        prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
+            prompt=prompt,
+            prompt_2=prompt_2,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            lora_scale=lora_scale,
+        )
+
+        # 4. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(
+                mask_image, global_width, global_height, pad=padding_mask_crop
+            )
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=global_height, width=global_width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
+        # 5. Prepare control image
+        # num_channels_latents = self.transformer.config.in_channels // 4
+        num_channels_latents = self.vae.config.latent_channels
+
+        if isinstance(self.controlnet, FluxControlNetModel):
+            control_image = self.prepare_image(
+                image=control_image,
+                width=width,
+                height=height,
+                batch_size=batch_size * num_images_per_prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                device=device,
+                dtype=self.vae.dtype,
+            )
+            height, width = control_image.shape[-2:]
+
+            # xlab controlnet has a input_hint_block and instantx controlnet does not
+            controlnet_blocks_repeat = False if self.controlnet.input_hint_block is None else True
+            if self.controlnet.input_hint_block is None:
+                # vae encode
+                control_image = retrieve_latents(self.vae.encode(control_image), generator=generator)
+                control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                # pack
+                height_control_image, width_control_image = control_image.shape[2:]
+                control_image = self._pack_latents(
+                    control_image,
+                    batch_size * num_images_per_prompt,
+                    num_channels_latents,
+                    height_control_image,
+                    width_control_image,
+                )
+
+            # set control mode
+            if control_mode is not None:
+                control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
+                control_mode = control_mode.reshape([-1, 1])
+
+        elif isinstance(self.controlnet, FluxMultiControlNetModel):
+            control_images = []
+
+            # xlab controlnet has a input_hint_block and instantx controlnet does not
+            controlnet_blocks_repeat = False if self.controlnet.nets[0].input_hint_block is None else True
+            for i, control_image_ in enumerate(control_image):
+                control_image_ = self.prepare_image(
+                    image=control_image_,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=self.vae.dtype,
+                )
+                height, width = control_image_.shape[-2:]
+
+                if self.controlnet.nets[0].input_hint_block is None:
+                    # vae encode
+                    control_image_ = retrieve_latents(self.vae.encode(control_image_), generator=generator)
+                    control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+                    # pack
+                    height_control_image, width_control_image = control_image_.shape[2:]
+                    control_image_ = self._pack_latents(
+                        control_image_,
+                        batch_size * num_images_per_prompt,
+                        num_channels_latents,
+                        height_control_image,
+                        width_control_image,
+                    )
+
+                control_images.append(control_image_)
+
+            control_image = control_images
+
+            # set control mode
+            control_mode_ = []
+            if isinstance(control_mode, list):
+                for cmode in control_mode:
+                    if cmode is None:
+                        control_mode_.append(-1)
+                    else:
+                        control_mode_.append(cmode)
+            control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
+            control_mode = control_mode.reshape([-1, 1])
+
+        # 6. Prepare timesteps
+
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(global_height) // self.vae_scale_factor // 2) * (
+            int(global_width) // self.vae_scale_factor // 2
+        )
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 7. Prepare latent variables
+
+        latents, noise, image_latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            global_height,
+            global_width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 8. Prepare mask latents
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=global_height, width=global_width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+        if masked_image_latents is None:
+            masked_image = init_image * (mask_condition < 0.5)
+        else:
+            masked_image = masked_image_latents
+
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask_condition,
+            masked_image,
+            batch_size,
+            num_channels_latents,
+            num_images_per_prompt,
+            global_height,
+            global_width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        mask_image_fill = self.mask_processor.preprocess(mask_image, height=height, width=width)
+        masked_image_fill = init_image * (1 - mask_image_fill)
+        masked_image_fill = masked_image_fill.to(dtype=self.vae.dtype, device=device)
+        mask_fill, masked_latents_fill = self.prepare_mask_latents_fill(
+            mask_image_fill,
+            masked_image_fill,
+            batch_size,
+            num_channels_latents,
+            num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+        )
+
+        controlnet_keep = []
+        for i in range(len(timesteps)):
+            keeps = [
+                1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
+                for s, e in zip(control_guidance_start, control_guidance_end)
+            ]
+            controlnet_keep.append(keeps[0] if isinstance(self.controlnet, FluxControlNetModel) else keeps)
+
+        # 9. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                # predict the noise residual
+                if isinstance(self.controlnet, FluxMultiControlNetModel):
+                    use_guidance = self.controlnet.nets[0].config.guidance_embeds
+                else:
+                    use_guidance = self.controlnet.config.guidance_embeds
+                if use_guidance:
+                    guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+                    guidance = guidance.expand(latents.shape[0])
+                else:
+                    guidance = None
+
+                if isinstance(controlnet_keep[i], list):
+                    cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
+                else:
+                    controlnet_cond_scale = controlnet_conditioning_scale
+                    if isinstance(controlnet_cond_scale, list):
+                        controlnet_cond_scale = controlnet_cond_scale[0]
+                    cond_scale = controlnet_cond_scale * controlnet_keep[i]
+
+                controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
+                    hidden_states=latents,
+                    controlnet_cond=control_image,
+                    controlnet_mode=control_mode,
+                    conditioning_scale=cond_scale,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                )
+
+                if self.transformer.config.guidance_embeds:
+                    guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+                    guidance = guidance.expand(latents.shape[0])
+                else:
+                    guidance = None
+
+                masked_image_latents_fill = torch.cat((masked_latents_fill, mask_fill), dim=-1)
+                latent_model_input = torch.cat([latents, masked_image_latents_fill], dim=2)
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    pooled_projections=pooled_prompt_embeds,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_block_samples=controlnet_block_samples,
+                    controlnet_single_block_samples=controlnet_single_block_samples,
+                    txt_ids=text_ids,
+                    img_ids=latent_image_ids,
+                    joint_attention_kwargs=self.joint_attention_kwargs,
+                    return_dict=False,
+                    controlnet_blocks_repeat=controlnet_blocks_repeat,
+                )[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                # For inpainting, we need to apply the mask and add the masked image latents
+                init_latents_proper = image_latents
+                init_mask = mask
+
+                if i < len(timesteps) - 1:
+                    noise_timestep = timesteps[i + 1]
+                    init_latents_proper = self.scheduler.scale_noise(
+                        init_latents_proper, torch.tensor([noise_timestep]), noise
+                    )
+
+                latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    control_image = callback_outputs.pop("control_image", control_image)
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        # Post-processing
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents(latents, global_height, global_width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return FluxPipelineOutput(images=image)
diff --git a/examples/community/regional_prompting_stable_diffusion.py b/examples/community/regional_prompting_stable_diffusion.py
index bca67e3959d8..3bc780cfcf7a 100644
--- a/examples/community/regional_prompting_stable_diffusion.py
+++ b/examples/community/regional_prompting_stable_diffusion.py
@@ -490,7 +490,7 @@ def hook_forwards(root_module: torch.nn.Module):
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
         # and should be between [0, 1]
 
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -841,7 +841,7 @@ def stable_diffusion_call(
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies
                 to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
@@ -872,7 +872,7 @@ def stable_diffusion_call(
                 [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             guidance_rescale (`float`, *optional*, defaults to 0.0):
                 Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
                 using zero terminal SNR.
             clip_skip (`int`, *optional*):
                 Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
@@ -1062,7 +1062,7 @@ def stable_diffusion_call(
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
 
                 if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
-                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    # Based on 3.4. in https://huggingface.co/papers/2305.08891
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
@@ -1668,7 +1668,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
     r"""
     Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
     Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
-    Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+    Flawed](https://huggingface.co/papers/2305.08891).
 
     Args:
         noise_cfg (`torch.Tensor`):
diff --git a/examples/dreambooth/README_flux.md b/examples/dreambooth/README_flux.md
index 242f018b654b..42edbb122136 100644
--- a/examples/dreambooth/README_flux.md
+++ b/examples/dreambooth/README_flux.md
@@ -268,12 +268,11 @@ provide a simple script for LoRA fine-tuning Kontext in [train_dreambooth_lora_f
 **important**
 
 > [!NOTE] 
-> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source, specifically from the commit mentioned below.
+> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source.
 > To do this, execute the following steps in a new virtual environment:
 > ```
 > git clone https://github.com/huggingface/diffusers
 > cd diffusers
-> git checkout 05e7a854d0a5661f5b433f6dd5954c224b104f0b
 > pip install -e .
 > ```
 
diff --git a/examples/dreambooth/README_flux2.md b/examples/dreambooth/README_flux2.md
new file mode 100644
index 000000000000..1d1777811387
--- /dev/null
+++ b/examples/dreambooth/README_flux2.md
@@ -0,0 +1,315 @@
+# DreamBooth training example for FLUX.2 [dev]
+
+[DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize image generation models given just a few (3~5) images of a subject/concept.
+
+The `train_dreambooth_lora_flux2.py` script shows how to implement the training procedure for [LoRAs](https://huggingface.co/blog/lora) and adapt it for [FLUX.2 [dev]](https://github.com/black-forest-labs/flux2).
+
+> [!NOTE]
+> **Memory consumption**
+>
+> Flux can be quite expensive to run on consumer hardware devices and as a result finetuning it comes with high memory requirements -
+> a LoRA with a rank of 16 can exceed XXGB of VRAM for training. below we provide some tips and tricks to reduce memory consumption during training.
+
+> For more tips & guidance on training on a resource-constrained device and general good practices please check out these great guides and trainers for FLUX: 
+> 1) [`@bghira`'s guide](https://github.com/bghira/SimpleTuner/blob/main/documentation/quickstart/FLUX2.md)
+> 2) [`ostris`'s guide](https://github.com/ostris/ai-toolkit?tab=readme-ov-file#flux2-training)
+
+> [!NOTE]
+> **Gated model**
+>
+> As the model is gated, before using it with diffusers you first need to go to the [FLUX.2 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.2-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+
+```bash
+hf auth login
+```
+
+This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the `examples/dreambooth` folder and run
+```bash
+pip install -r requirements_flux.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell (e.g., a notebook)
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+
+### Dog toy example
+
+Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
+
+Let's first download it locally:
+
+```python
+from huggingface_hub import snapshot_download
+
+local_dir = "./dog"
+snapshot_download(
+    "diffusers/dog-example",
+    local_dir=local_dir, repo_type="dataset",
+    ignore_patterns=".gitattributes",
+)
+```
+
+This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
+
+As mentioned, Flux2 LoRA training is *very* memory intensive. Here are memory optimizations we can use (some still experimental) for a more memory efficient training:
+
+## Memory Optimizations
+> [!NOTE] many of these techniques complement each other and can be used together to further reduce memory consumption. 
+> However some techniques may be mutually exclusive so be sure to check before launching a training run.
+### Remote Text Encoder 
+Flux.2 uses  Mistral Small 3.1 as text encoder which is quite large and can take up a lot of memory. To mitigate this, we can use the `--remote_text_encoder` flag to enable remote computation of the prompt embeddings using the HuggingFace Inference API. 
+This way, the text encoder model is not loaded into memory during training.
+> [!NOTE] 
+> to enable remote text encoding you must either be logged in to your HuggingFace account (`hf auth login`) OR pass a token with `--hub_token`.
+### CPU Offloading 
+To offload parts of the model to CPU memory, you can use `--offload` flag. This will offload the vae and text encoder to CPU memory and only move them to GPU when needed.
+### Latent Caching 
+Pre-encode the training images with the vae, and then delete it to free up some memory. To enable `latent_caching` simply pass `--cache_latents`.
+### QLoRA: Low Precision Training with Quantization
+Perform low precision training using 8-bit or 4-bit quantization to reduce memory usage. You can use the following flags:
+- **FP8 training** with `torchao`: 
+enable FP8 training by passing `--do_fp8_training`. 
+> [!IMPORTANT] Since we are utilizing FP8 tensor cores we need CUDA GPUs with compute capability at least 8.9 or greater. 
+> If you're looking for memory-efficient training on relatively older cards, we encourage you to check out other trainers like SimpleTuner, ai-toolkit, etc.
+- **NF4 training** with `bitsandbytes`: 
+Alternatively, you can use 8-bit or 4-bit quantization with `bitsandbytes` by passing:
+`--bnb_quantization_config_path` to enable 4-bit NF4 quantization.
+### Gradient Checkpointing and Accumulation
+* `--gradient accumulation` refers to the number of updates steps to accumulate before performing a backward/update pass.
+by passing a value > 1 you can reduce the amount of backward/update passes and hence also memory reqs.
+* with `--gradient checkpointing` we can save memory by not storing all intermediate activations during the forward pass.
+Instead, only a subset of these activations (the checkpoints) are stored and the rest is recomputed as needed during the backward pass. Note that this comes at the expanse of a slower backward pass.
+### 8-bit-Adam Optimizer
+When training with `AdamW`(doesn't apply to `prodigy`) You can pass `--use_8bit_adam` to reduce the memory requirements of training. 
+Make sure to install `bitsandbytes` if you want to do so.
+### Image Resolution
+An easy way to mitigate some of the memory requirements is through `--resolution`. `--resolution` refers to the resolution for input images, all the images in the train/validation dataset are resized to this.
+Note that by default, images are resized to resolution of 512, but it's good to keep in mind in case you're accustomed to training on higher resolutions.
+### Precision of saved LoRA layers
+By default, trained transformer layers are saved in the precision dtype in which training was performed. E.g. when training in mixed precision is enabled with `--mixed_precision="bf16"`, final finetuned layers will be saved in `torch.bfloat16` as well. 
+This reduces memory requirements significantly w/o a significant quality loss. Note that if you do wish to save the final layers in float32 at the expanse of more memory usage, you can do so by passing `--upcast_before_saving`.
+
+
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.2-dev"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-flux2"
+
+accelerate launch train_dreambooth_lora_flux2.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --do_fp8_training \
+  --gradient_checkpointing \
+  --remote_text_encoder \
+  --cache_latents \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --guidance_scale=1 \
+  --use_8bit_adam \
+  --gradient_accumulation_steps=4 \
+  --optimizer="adamW" \
+  --learning_rate=1e-4 \
+  --report_to="wandb" \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=100 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+To better track our training experiments, we're using the following flags in the command above:
+
+* `report_to="wandb` will ensure the training runs are tracked on [Weights and Biases](https://wandb.ai/site). To use it, be sure to install `wandb` with `pip install wandb`. Don't forget to call `wandb login <your_api_key>` before training if you haven't done it before.
+* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+
+> [!NOTE]
+> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
+
+## LoRA + DreamBooth
+
+[LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a popular parameter-efficient fine-tuning technique that allows you to achieve full-finetuning like performance but with a fraction of learnable parameters.
+
+Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment.
+
+### Prodigy Optimizer
+Prodigy is an adaptive optimizer that dynamically adjusts the learning rate learned parameters based on past gradients, allowing for more efficient convergence. 
+By using prodigy we can "eliminate" the need for manual learning rate tuning. read more [here](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers).
+
+to use prodigy, first make sure to install the prodigyopt library: `pip install prodigyopt`, and then specify -
+```bash
+--optimizer="prodigy"
+```
+> [!TIP]
+> When using prodigy it's generally good practice to set- `--learning_rate=1.0`
+
+To perform DreamBooth with LoRA, run:
+
+```bash
+export MODEL_NAME="black-forest-labs/FLUX.2-dev"
+export INSTANCE_DIR="dog"
+export OUTPUT_DIR="trained-flux2-lora"
+
+accelerate launch train_dreambooth_lora_flux2.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --do_fp8_training \
+  --gradient_checkpointing \
+  --remote_text_encoder \
+  --cache_latents \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --guidance_scale=1 \
+  --gradient_accumulation_steps=4 \
+  --optimizer="prodigy" \
+  --learning_rate=1. \
+  --report_to="wandb" \
+  --lr_scheduler="constant_with_warmup" \
+  --lr_warmup_steps=100 \
+  --max_train_steps=500 \
+  --validation_prompt="A photo of sks dog in a bucket" \
+  --validation_epochs=25 \
+  --seed="0" \
+  --push_to_hub
+```
+
+### LoRA Rank and Alpha
+Two key LoRA hyperparameters are LoRA rank and LoRA alpha. 
+- `--rank`: Defines the dimension of the trainable LoRA matrices. A higher rank means more expressiveness and capacity to learn (and more parameters).
+- `--lora_alpha`: A scaling factor for the LoRA's output. The LoRA update is scaled by lora_alpha / lora_rank.
+- lora_alpha vs. rank:
+This ratio dictates the LoRA's effective strength:
+lora_alpha == rank: Scaling factor is 1. The LoRA is applied with its learned strength. (e.g., alpha=16, rank=16)
+lora_alpha < rank: Scaling factor < 1. Reduces the LoRA's impact. Useful for subtle changes or to prevent overpowering the base model. (e.g., alpha=8, rank=16)
+lora_alpha > rank: Scaling factor > 1. Amplifies the LoRA's impact. Allows a lower rank LoRA to have a stronger effect. (e.g., alpha=32, rank=16)
+
+> [!TIP]
+> A common starting point is to set `lora_alpha` equal to `rank`. 
+> Some also set `lora_alpha` to be twice the `rank` (e.g., lora_alpha=32 for lora_rank=16) 
+> to give the LoRA updates more influence without increasing parameter count. 
+> If you find your LoRA is "overcooking" or learning too aggressively, consider setting `lora_alpha` to half of `rank` 
+> (e.g., lora_alpha=8 for rank=16). Experimentation is often key to finding the optimal balance for your use case.
+
+### Target Modules
+When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the Unet that relate the image representations with the prompts that describe them. 
+More recently, SOTA text-to-image diffusion models replaced the Unet with a diffusion Transformer(DiT). With this change, we may also want to explore 
+applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma separated string
+the exact modules for LoRA training. Here are some examples of target modules you can provide: 
+- for attention only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
+- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
+- to train the same modules as in ostris ai-toolkit / replicate trainer: `--lora_blocks="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear, norm1.linear,norm.linear,proj_mlp,proj_out"`
+> [!NOTE]
+> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
+> **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
+> **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k` 
+> [!NOTE]
+> keep in mind that while training more layers can improve quality and expressiveness, it also increases the size of the output LoRA weights.
+
+
+
+## Training Image-to-Image
+
+Flux.2 lets us perform image editing as well as image generation. We provide a simple script for image-to-image(I2I) LoRA fine-tuning in [train_dreambooth_lora_flux2_img2img.py](./train_dreambooth_lora_flux2_img2img.py) for both T2I and I2I. The optimizations discussed above apply this script, too.
+
+**important**
+
+**Important**
+To make sure you can successfully run the latest version of the image-to-image example script, we highly recommend installing from source, specifically from the commit mentioned below. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+
+To start, you must have a dataset containing triplets:
+
+* Condition image - the input image to be transformed.
+* Target image - the desired output image after transformation.
+* Instruction - a text prompt describing the transformation from the condition image to the target image.
+
+[kontext-community/relighting](https://huggingface.co/datasets/kontext-community/relighting) is a good example of such a dataset. If you are using such a dataset, you can use the command below to launch training:
+
+```bash
+accelerate launch train_dreambooth_lora_flux2_img2img.py \
+  --pretrained_model_name_or_path=black-forest-labs/FLUX.2-dev  \
+  --output_dir="flux2-i2i" \
+  --dataset_name="kontext-community/relighting" \
+  --image_column="output" --cond_image_column="file_name" --caption_column="instruction" \
+  --do_fp8_training \
+  --gradient_checkpointing \
+  --remote_text_encoder \
+  --cache_latents \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --guidance_scale=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --optimizer="adamw" \
+  --use_8bit_adam \
+  --cache_latents \
+  --learning_rate=1e-4 \
+  --lr_scheduler="constant_with_warmup" \
+  --lr_warmup_steps=200 \
+  --max_train_steps=1000 \
+  --rank=16\
+  --seed="0" 
+```
+
+More generally, when performing I2I fine-tuning, we expect you to:
+
+* Have a dataset `kontext-community/relighting`
+* Supply `image_column`, `cond_image_column`, and `caption_column` values when launching training
+
+### Misc notes
+
+* By default, we use `mode` as the value of `--vae_encode_mode` argument. This is because Kontext uses `mode()` of the distribution predicted by the VAE instead of sampling from it.
+### Aspect Ratio Bucketing
+we've added aspect ratio bucketing support which allows training on images with different aspect ratios without cropping them to a single square resolution. This technique helps preserve the original composition of training images and can improve training efficiency.
+
+To enable aspect ratio bucketing, pass `--aspect_ratio_buckets` argument with a semicolon-separated list of height,width pairs, such as:
+
+`--aspect_ratio_buckets="672,1568;688,1504;720,1456;752,1392;800,1328;832,1248;880,1184;944,1104;1024,1024;1104,944;1184,880;1248,832;1328,800;1392,752;1456,720;1504,688;1568,672"
+`
+Since Flux.2 finetuning is still an experimental phase, we encourage you to explore different settings and share your insights! 🤗
diff --git a/examples/dreambooth/test_dreambooth_lora_flux2.py b/examples/dreambooth/test_dreambooth_lora_flux2.py
new file mode 100644
index 000000000000..80a0b502f9a2
--- /dev/null
+++ b/examples/dreambooth/test_dreambooth_lora_flux2.py
@@ -0,0 +1,262 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import sys
+import tempfile
+
+import safetensors
+
+from diffusers.loaders.lora_base import LORA_ADAPTER_METADATA_KEY
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+class DreamBoothLoRAFlux2(ExamplesTestsAccelerate):
+    instance_data_dir = "docs/source/en/imgs"
+    instance_prompt = "dog"
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2"
+    script_path = "examples/dreambooth/train_dreambooth_lora_flux2.py"
+    transformer_layer_type = "single_transformer_blocks.0.attn.to_qkv_mlp_proj"
+
+    def test_dreambooth_lora_flux2(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --max_sequence_length 8
+                --text_encoder_out_layers 1
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_latent_caching(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --cache_latents
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --max_sequence_length 8
+                --text_encoder_out_layers 1
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names.
+            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_layers(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --cache_latents
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lora_layers {self.transformer_layer_type}
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --max_sequence_length 8
+                --text_encoder_out_layers 1
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
+
+            # make sure the state_dict has the correct naming in the parameters.
+            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
+            is_lora = all("lora" in k for k in lora_state_dict.keys())
+            self.assertTrue(is_lora)
+
+            # when not training the text encoder, all the parameters in the state dict should start
+            # with `"transformer"` in their names. In this test, we only params of
+            # transformer.single_transformer_blocks.0.attn.to_k should be in the state dict
+            starts_with_transformer = all(
+                key.startswith(f"transformer.{self.transformer_layer_type}") for key in lora_state_dict.keys()
+            )
+            self.assertTrue(starts_with_transformer)
+
+    def test_dreambooth_lora_flux2_checkpointing_checkpoints_total_limit(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=6
+            --checkpoints_total_limit=2
+            --max_sequence_length 8
+            --checkpointing_steps=2
+            --text_encoder_out_layers 1
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_dreambooth_lora_flux2_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=4
+            --checkpointing_steps=2
+            --max_sequence_length 8
+            --text_encoder_out_layers 1
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
+
+            resume_run_args = f"""
+            {self.script_path}
+            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
+            --instance_data_dir={self.instance_data_dir}
+            --output_dir={tmpdir}
+            --instance_prompt={self.instance_prompt}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --max_train_steps=8
+            --checkpointing_steps=2
+            --resume_from_checkpoint=checkpoint-4
+            --checkpoints_total_limit=2
+            --max_sequence_length 8
+            --text_encoder_out_layers 1
+            """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
+
+    def test_dreambooth_lora_with_metadata(self):
+        # Use a `lora_alpha` that is different from `rank`.
+        lora_alpha = 8
+        rank = 4
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                {self.script_path}
+                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
+                --instance_data_dir {self.instance_data_dir}
+                --instance_prompt {self.instance_prompt}
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --lora_alpha={lora_alpha}
+                --rank={rank}
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --max_sequence_length 8
+                --text_encoder_out_layers 1
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            state_dict_file = os.path.join(tmpdir, "pytorch_lora_weights.safetensors")
+            self.assertTrue(os.path.isfile(state_dict_file))
+
+            # Check if the metadata was properly serialized.
+            with safetensors.torch.safe_open(state_dict_file, framework="pt", device="cpu") as f:
+                metadata = f.metadata() or {}
+
+            metadata.pop("format", None)
+            raw = metadata.get(LORA_ADAPTER_METADATA_KEY)
+            if raw:
+                raw = json.loads(raw)
+
+            loaded_lora_alpha = raw["transformer.lora_alpha"]
+            self.assertTrue(loaded_lora_alpha == lora_alpha)
+            loaded_lora_rank = raw["transformer.r"]
+            self.assertTrue(loaded_lora_rank == rank)
diff --git a/examples/dreambooth/train_dreambooth_lora_flux2.py b/examples/dreambooth/train_dreambooth_lora_flux2.py
new file mode 100644
index 000000000000..733abe16d2eb
--- /dev/null
+++ b/examples/dreambooth/train_dreambooth_lora_flux2.py
@@ -0,0 +1,1914 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "diffusers @ git+https://github.com/huggingface/diffusers.git",
+#     "torch>=2.0.0",
+#     "accelerate>=0.31.0",
+#     "transformers>=4.41.2",
+#     "ftfy",
+#     "tensorboard",
+#     "Jinja2",
+#     "peft>=0.11.1",
+#     "sentencepiece",
+#     "torchvision",
+#     "datasets",
+#     "bitsandbytes",
+#     "prodigyopt",
+# ]
+# ///
+
+import argparse
+import copy
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+import warnings
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from huggingface_hub.utils import insecure_hashlib
+from peft import LoraConfig, prepare_model_for_kbit_training, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torch.utils.data.sampler import BatchSampler
+from torchvision import transforms
+from torchvision.transforms import functional as TF
+from tqdm.auto import tqdm
+from transformers import Mistral3ForConditionalGeneration, PixtralProcessor
+
+import diffusers
+from diffusers import (
+    AutoencoderKLFlux2,
+    BitsAndBytesConfig,
+    FlowMatchEulerDiscreteScheduler,
+    Flux2Pipeline,
+    Flux2Transformer2DModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import (
+    _collate_lora_metadata,
+    cast_training_params,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+    find_nearest_bucket,
+    free_memory,
+    offload_models,
+    parse_buckets_string,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_torch_npu_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.36.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+    quant_training=None,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# Flux2 DreamBooth LoRA - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth LoRA weights for {base_model}.
+
+The weights were trained using [DreamBooth](https://dreambooth.github.io/) with the [Flux2 diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_flux2.md).
+
+Quant training? {quant_training}
+
+## Trigger words
+
+You should use `{instance_prompt}` to trigger the image generation.
+
+## Download model
+
+[Download the *.safetensors LoRA]({repo_id}/tree/main) in the Files & versions tab.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+pipeline = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.2", torch_dtype=torch.bfloat16).to('cuda')
+pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/black-forest-labs/FLUX.2/blob/main/LICENSE.md).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="other",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora",
+        "flux2",
+        "flux2-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    args.num_validation_images = args.num_validation_images if args.num_validation_images else 1
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(dtype=torch_dtype)
+    pipeline.enable_model_cpu_offload()
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
+    autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+
+    images = []
+    for _ in range(args.num_validation_images):
+        with autocast_ctx:
+            image = pipeline(
+                prompt_embeds=pipeline_args["prompt_embeds"],
+                generator=generator,
+            ).images[0]
+            images.append(image)
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    free_memory()
+
+    return images
+
+
+def module_filter_fn(mod: torch.nn.Module, fqn: str):
+    # don't convert the output module
+    if fqn == "proj_out":
+        return False
+    # don't convert linear modules with weight dimensions not divisible by 16
+    if isinstance(mod, torch.nn.Linear):
+        if mod.in_features % 16 != 0 or mod.out_features % 16 != 0:
+            return False
+    return True
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--bnb_quantization_config_path",
+        type=str,
+        default=None,
+        help="Quantization config in a JSON file that will be used to define the bitsandbytes quant config of the DiT.",
+    )
+    parser.add_argument(
+        "--do_fp8_training",
+        action="store_true",
+        help="if we are doing FP8 training.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=512,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--text_encoder_out_layers",
+        type=int,
+        nargs="+",
+        default=[10, 20, 30],
+        help="Text encoder hidden layers to compute the final text embeddings.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--skip_final_inference",
+        default=False,
+        action="store_true",
+        help="Whether to skip the final inference step with loaded lora weights upon training completion. This will run intermediate validation inference if `validation_prompt` is provided. Specify to reduce memory.",
+    )
+    parser.add_argument(
+        "--final_validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during a final validation to verify that the model is learning. Ignored if `--validation_prompt` is provided.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=4,
+        help="LoRA alpha to be used for additional scaling.",
+    )
+    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
+
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="flux-dreambooth-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--aspect_ratio_buckets",
+        type=str,
+        default=None,
+        help=(
+            "Aspect ratio buckets to use for training. Define as a string of 'h1,w1;h2,w2;...'. "
+            "e.g. '1024,1024;768,1360;1360,768;880,1168;1168,880;1248,832;832,1248'"
+            "Images will be resized and cropped to fit the nearest bucket. If provided, --resolution is ignored."
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=3.5,
+        help="the FLUX.1 dev variant is a guidance distilled model",
+    )
+
+    parser.add_argument(
+        "--text_encoder_lr",
+        type=float,
+        default=5e-6,
+        help="Text encoder learning rate to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="none",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"],
+        help=('We default to the "none" weighting scheme for uniform sampling and uniform loss'),
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--lora_layers",
+        type=str,
+        default=None,
+        help=(
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
+        ),
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Cache the VAE latents",
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--upcast_before_saving",
+        action="store_true",
+        default=False,
+        help=(
+            "Whether to upcast the trained transformer layers to float32 before saving (at the end of training). "
+            "Defaults to precision dtype used for training to save memory"
+        ),
+    )
+    parser.add_argument(
+        "--offload",
+        action="store_true",
+        help="Whether to offload the VAE and the text encoder to CPU when they are not used.",
+    )
+    parser.add_argument(
+        "--remote_text_encoder",
+        action="store_true",
+        help="Whether to use a remote text encoder. This means the text encoder will not be loaded locally and instead, the prompt embeddings will be computed remotely using the HuggingFace Inference API.",
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--enable_npu_flash_attention", action="store_true", help="Enabla Flash Attention for NPU")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+    if args.do_fp8_training and args.bnb_quantization_config_path:
+        raise ValueError("Both `do_fp8_training` and `bnb_quantization_config_path` cannot be passed.")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        class_prompt,
+        class_data_root=None,
+        class_num=None,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+        buckets=None,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+        self.class_prompt = class_prompt
+
+        self.buckets = buckets
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        for img in instance_images:
+            self.instance_images.extend(itertools.repeat(img, repeats))
+
+        self.pixel_values = []
+        for i, image in enumerate(self.instance_images):
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+
+            width, height = image.size
+
+            # Find the closest bucket
+            bucket_idx = find_nearest_bucket(height, width, self.buckets)
+            target_height, target_width = self.buckets[bucket_idx]
+            self.size = (target_height, target_width)
+
+            # based on the bucket assignment, define the transformations
+            image = self.train_transform(
+                image,
+                size=self.size,
+                center_crop=args.center_crop,
+                random_flip=args.random_flip,
+            )
+            self.pixel_values.append((image, bucket_idx))
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            if class_num is not None:
+                self.num_class_images = min(len(self.class_images_path), class_num)
+            else:
+                self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image, bucket_idx = self.pixel_values[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+        example["bucket_idx"] = bucket_idx
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            class_image = exif_transpose(class_image)
+
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt"] = self.class_prompt
+
+        return example
+
+    def train_transform(self, image, size=(224, 224), center_crop=False, random_flip=False):
+        # 1. Resize (deterministic)
+        resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        image = resize(image)
+
+        # 2. Crop: either center or SAME random crop
+        if center_crop:
+            crop = transforms.CenterCrop(size)
+            image = crop(image)
+        else:
+            # get_params returns (i, j, h, w)
+            i, j, h, w = transforms.RandomCrop.get_params(image, output_size=size)
+            image = TF.crop(image, i, j, h, w)
+
+        # 3. Random horizontal flip with the SAME coin flip
+        if random_flip:
+            do_flip = random.random() < 0.5
+            if do_flip:
+                image = TF.hflip(image)
+
+        # 4. ToTensor + Normalize (deterministic)
+        to_tensor = transforms.ToTensor()
+        normalize = transforms.Normalize([0.5], [0.5])
+        image = normalize(to_tensor(image))
+
+        return image
+
+
+def collate_fn(examples, with_prior_preservation=False):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    # Concat class and instance examples for prior preservation.
+    # We do this to avoid doing two forward passes.
+    if with_prior_preservation:
+        pixel_values += [example["class_images"] for example in examples]
+        prompts += [example["class_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    return batch
+
+
+class BucketBatchSampler(BatchSampler):
+    def __init__(self, dataset: DreamBoothDataset, batch_size: int, drop_last: bool = False):
+        if not isinstance(batch_size, int) or batch_size <= 0:
+            raise ValueError("batch_size should be a positive integer value, but got batch_size={}".format(batch_size))
+        if not isinstance(drop_last, bool):
+            raise ValueError("drop_last should be a boolean value, but got drop_last={}".format(drop_last))
+
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+
+        # Group indices by bucket
+        self.bucket_indices = [[] for _ in range(len(self.dataset.buckets))]
+        for idx, (_, bucket_idx) in enumerate(self.dataset.pixel_values):
+            self.bucket_indices[bucket_idx].append(idx)
+
+        self.sampler_len = 0
+        self.batches = []
+
+        # Pre-generate batches for each bucket
+        for indices_in_bucket in self.bucket_indices:
+            # Shuffle indices within the bucket
+            random.shuffle(indices_in_bucket)
+            # Create batches
+            for i in range(0, len(indices_in_bucket), self.batch_size):
+                batch = indices_in_bucket[i : i + self.batch_size]
+                if len(batch) < self.batch_size and self.drop_last:
+                    continue  # Skip partial batch if drop_last is True
+                self.batches.append(batch)
+                self.sampler_len += 1  # Count the number of batches
+
+    def __iter__(self):
+        # Shuffle the order of the batches each epoch
+        random.shuffle(self.batches)
+        for batch in self.batches:
+            yield batch
+
+    def __len__(self):
+        return self.sampler_len
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `hf auth login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+    if args.do_fp8_training:
+        from torchao.float8 import Float8LinearConfig, convert_to_float8_training
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
+            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
+            if args.prior_generation_precision == "fp32":
+                torch_dtype = torch.float32
+            elif args.prior_generation_precision == "fp16":
+                torch_dtype = torch.float16
+            elif args.prior_generation_precision == "bf16":
+                torch_dtype = torch.bfloat16
+
+            pipeline = Flux2Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                revision=args.revision,
+                variant=args.variant,
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            sample_dataloader = accelerator.prepare(sample_dataloader)
+            pipeline.to(accelerator.device)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+            ):
+                with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
+                    images = pipeline(prompt=example["prompt"]).images
+
+                for i, image in enumerate(images):
+                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+            free_memory()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer = PixtralProcessor.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="scheduler",
+        revision=args.revision,
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    vae = AutoencoderKLFlux2.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(accelerator.device)
+    latents_bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to(
+        accelerator.device
+    )
+
+    quantization_config = None
+    if args.bnb_quantization_config_path is not None:
+        with open(args.bnb_quantization_config_path, "r") as f:
+            config_kwargs = json.load(f)
+            if "load_in_4bit" in config_kwargs and config_kwargs["load_in_4bit"]:
+                config_kwargs["bnb_4bit_compute_dtype"] = weight_dtype
+        quantization_config = BitsAndBytesConfig(**config_kwargs)
+
+    transformer = Flux2Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        revision=args.revision,
+        variant=args.variant,
+        quantization_config=quantization_config,
+        torch_dtype=weight_dtype,
+    )
+    if args.bnb_quantization_config_path is not None:
+        transformer = prepare_model_for_kbit_training(transformer, use_gradient_checkpointing=False)
+
+    if not args.remote_text_encoder:
+        text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+        )
+        text_encoder.requires_grad_(False)
+
+    # We only train the additional adapter LoRA layers
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+
+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            transformer.set_attention_backend("_native_npu")
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    to_kwargs = {"dtype": weight_dtype, "device": accelerator.device} if not args.offload else {"dtype": weight_dtype}
+    # flux vae is stable in bf16 so load it in weight_dtype to reduce memory
+    vae.to(**to_kwargs)
+    # we never offload the transformer to CPU, so we can just use the accelerator device
+    transformer_to_kwargs = (
+        {"device": accelerator.device}
+        if args.bnb_quantization_config_path is not None
+        else {"device": accelerator.device, "dtype": weight_dtype}
+    )
+    transformer.to(**transformer_to_kwargs)
+    if args.do_fp8_training:
+        convert_to_float8_training(
+            transformer, module_filter_fn=module_filter_fn, config=Float8LinearConfig(pad_inner_dim=True)
+        )
+
+    if not args.remote_text_encoder:
+        text_encoder.to(**to_kwargs)
+        # Initialize a text encoding pipeline and keep it to CPU for now.
+        text_encoding_pipeline = Flux2Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=None,
+            transformer=None,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            scheduler=None,
+            revision=args.revision,
+        )
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+
+    if args.lora_layers is not None:
+        target_modules = [layer.strip() for layer in args.lora_layers.split(",")]
+    else:
+        target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
+
+    # now we will add new LoRA weights the transformer layers
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        init_lora_weights="gaussian",
+        target_modules=target_modules,
+    )
+    transformer.add_adapter(transformer_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            modules_to_save = {}
+            for model in models:
+                if isinstance(model, type(unwrap_model(transformer))):
+                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                    modules_to_save["transformer"] = model
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            Flux2Pipeline.save_lora_weights(
+                output_dir,
+                transformer_lora_layers=transformer_lora_layers_to_save,
+                **_collate_lora_metadata(modules_to_save),
+            )
+
+    def load_model_hook(models, input_dir):
+        transformer_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict = Flux2Pipeline.lora_state_dict(input_dir)
+
+        transformer_state_dict = {
+            f"{k.replace('transformer.', '')}": v for k, v in lora_state_dict.items() if k.startswith("transformer.")
+        }
+        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
+        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [transformer_]
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer_lora_parameters, "lr": args.learning_rate}
+    params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    if args.aspect_ratio_buckets is not None:
+        buckets = parse_buckets_string(args.aspect_ratio_buckets)
+    else:
+        buckets = [(args.resolution, args.resolution)]
+    logger.info(f"Using parsed aspect ratio buckets: {buckets}")
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_prompt=args.class_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_num=args.num_class_images,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+        buckets=buckets,
+    )
+    batch_sampler = BucketBatchSampler(train_dataset, batch_size=args.train_batch_size, drop_last=True)
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_sampler=batch_sampler,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    def compute_text_embeddings(prompt, text_encoding_pipeline):
+        with torch.no_grad():
+            prompt_embeds, text_ids = text_encoding_pipeline.encode_prompt(
+                prompt=prompt,
+                max_sequence_length=args.max_sequence_length,
+                text_encoder_out_layers=args.text_encoder_out_layers,
+            )
+        return prompt_embeds, text_ids
+
+    def compute_remote_text_embeddings(prompts):
+        import io
+
+        import requests
+
+        if args.hub_token is not None:
+            hf_token = args.hub_token
+        else:
+            from huggingface_hub import get_token
+
+            hf_token = get_token()
+            if hf_token is None:
+                raise ValueError(
+                    "No HuggingFace token found. To use the remote text encoder please login using `hf auth login` or provide a token using --hub_token"
+                )
+
+        def _encode_single(prompt: str):
+            response = requests.post(
+                "https://remote-text-encoder-flux-2.huggingface.co/predict",
+                json={"prompt": prompt},
+                headers={"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"},
+            )
+            assert response.status_code == 200, f"{response.status_code=}"
+            return torch.load(io.BytesIO(response.content))
+
+        try:
+            if isinstance(prompts, (list, tuple)):
+                embeds = [_encode_single(p) for p in prompts]
+                prompt_embeds = torch.cat(embeds, dim=0)
+            else:
+                prompt_embeds = _encode_single(prompts)
+
+            text_ids = Flux2Pipeline._prepare_text_ids(prompt_embeds).to(accelerator.device)
+            prompt_embeds = prompt_embeds.to(accelerator.device)
+            return prompt_embeds, text_ids
+
+        except Exception as e:
+            raise RuntimeError("Remote text encoder inference failed.") from e
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not train_dataset.custom_instance_prompts:
+        if args.remote_text_encoder:
+            instance_prompt_hidden_states, instance_text_ids = compute_remote_text_embeddings(args.instance_prompt)
+        else:
+            with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                instance_prompt_hidden_states, instance_text_ids = compute_text_embeddings(
+                    args.instance_prompt, text_encoding_pipeline
+                )
+
+    # Handle class prompt for prior-preservation.
+    if args.with_prior_preservation:
+        if args.remote_text_encoder:
+            class_prompt_hidden_states, class_text_ids = compute_remote_text_embeddings(args.class_prompt)
+        else:
+            with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                class_prompt_hidden_states, class_text_ids = compute_text_embeddings(
+                    args.class_prompt, text_encoding_pipeline
+                )
+    validation_embeddings = {}
+    if args.validation_prompt is not None:
+        if args.remote_text_encoder:
+            (validation_embeddings["prompt_embeds"], validation_embeddings["text_ids"]) = (
+                compute_remote_text_embeddings(args.validation_prompt)
+            )
+        else:
+            with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                (validation_embeddings["prompt_embeds"], validation_embeddings["text_ids"]) = compute_text_embeddings(
+                    args.validation_prompt, text_encoding_pipeline
+                )
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+    if not train_dataset.custom_instance_prompts:
+        prompt_embeds = instance_prompt_hidden_states
+        text_ids = instance_text_ids
+        if args.with_prior_preservation:
+            prompt_embeds = torch.cat([prompt_embeds, class_prompt_hidden_states], dim=0)
+            text_ids = torch.cat([text_ids, class_text_ids], dim=0)
+
+    # if cache_latents is set to True, we encode images to latents and store them.
+    # Similar to pre-encoding in the case of a single instance prompt, if custom prompts are provided
+    # we encode them in advance as well.
+    precompute_latents = args.cache_latents or train_dataset.custom_instance_prompts
+    if precompute_latents:
+        prompt_embeds_cache = []
+        text_ids_cache = []
+        latents_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                if args.cache_latents:
+                    with offload_models(vae, device=accelerator.device, offload=args.offload):
+                        batch["pixel_values"] = batch["pixel_values"].to(
+                            accelerator.device, non_blocking=True, dtype=vae.dtype
+                        )
+                        latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+                if train_dataset.custom_instance_prompts:
+                    if args.remote_text_encoder:
+                        prompt_embeds, text_ids = compute_remote_text_embeddings(batch["prompts"])
+                    else:
+                        with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                            prompt_embeds, text_ids = compute_text_embeddings(batch["prompts"], text_encoding_pipeline)
+                    prompt_embeds_cache.append(prompt_embeds)
+                    text_ids_cache.append(text_ids)
+
+    # move back to cpu before deleting to ensure memory is freed see: https://github.com/huggingface/diffusers/issues/11376#issue-3008144624
+    if args.cache_latents:
+        vae = vae.to("cpu")
+        del vae
+
+    # move back to cpu before deleting to ensure memory is freed see: https://github.com/huggingface/diffusers/issues/11376#issue-3008144624
+    if not args.remote_text_encoder:
+        text_encoding_pipeline = text_encoding_pipeline.to("cpu")
+        del text_encoder, tokenizer
+    free_memory()
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * accelerator.num_processes * num_update_steps_per_epoch
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        transformer, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-flux2-lora"
+        args_cp = vars(args).copy()
+        args_cp["text_encoder_out_layers"] = str(args_cp["text_encoder_out_layers"])
+        accelerator.init_trackers(tracker_name, config=args_cp)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            prompts = batch["prompts"]
+
+            with accelerator.accumulate(models_to_accumulate):
+                if train_dataset.custom_instance_prompts:
+                    prompt_embeds = prompt_embeds_cache[step]
+                    text_ids = text_ids_cache[step]
+                else:
+                    num_repeat_elements = len(prompts)
+                    prompt_embeds = prompt_embeds.repeat(num_repeat_elements, 1, 1)
+                    text_ids = text_ids.repeat(num_repeat_elements, 1, 1)
+
+                # Convert images to latent space
+                if args.cache_latents:
+                    model_input = latents_cache[step].mode()
+                else:
+                    with offload_models(vae, device=accelerator.device, offload=args.offload):
+                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                    model_input = vae.encode(pixel_values).latent_dist.mode()
+
+                model_input = Flux2Pipeline._patchify_latents(model_input)
+                model_input = (model_input - latents_bn_mean) / latents_bn_std
+
+                model_input_ids = Flux2Pipeline._prepare_latent_ids(model_input).to(device=model_input.device)
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                # [B, C, H, W] -> [B, H*W, C]
+                packed_noisy_model_input = Flux2Pipeline._pack_latents(noisy_model_input)
+
+                # handle guidance
+                guidance = torch.full([1], args.guidance_scale, device=accelerator.device)
+                guidance = guidance.expand(model_input.shape[0])
+
+                # Predict the noise residual
+                model_pred = transformer(
+                    hidden_states=packed_noisy_model_input,  # (B, image_seq_len, C)
+                    timestep=timesteps / 1000,
+                    guidance=guidance,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,  # B, text_seq_len, 4
+                    img_ids=model_input_ids,  # B, image_seq_len, 4
+                    return_dict=False,
+                )[0]
+                model_pred = model_pred[:, : packed_noisy_model_input.size(1) :]
+
+                model_pred = Flux2Pipeline._unpack_latents_with_ids(model_pred, model_input_ids)
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                target = noise - model_input
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+
+                    # Compute prior loss
+                    prior_loss = torch.mean(
+                        (weighting.float() * (model_pred_prior.float() - target_prior.float()) ** 2).reshape(
+                            target_prior.shape[0], -1
+                        ),
+                        1,
+                    )
+                    prior_loss = prior_loss.mean()
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                if args.with_prior_preservation:
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = transformer.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                pipeline = Flux2Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    text_encoder=None,
+                    tokenizer=None,
+                    transformer=unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=validation_embeddings,
+                    epoch=epoch,
+                    torch_dtype=weight_dtype,
+                )
+
+                del pipeline
+                free_memory()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        modules_to_save = {}
+        transformer = unwrap_model(transformer)
+        if args.bnb_quantization_config_path is None:
+            if args.upcast_before_saving:
+                transformer.to(torch.float32)
+            else:
+                transformer = transformer.to(weight_dtype)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+        modules_to_save["transformer"] = transformer
+
+        Flux2Pipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+            **_collate_lora_metadata(modules_to_save),
+        )
+
+        images = []
+        run_validation = (args.validation_prompt and args.num_validation_images > 0) or (args.final_validation_prompt)
+        should_run_final_inference = not args.skip_final_inference and run_validation
+        if should_run_final_inference:
+            pipeline = Flux2Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=weight_dtype,
+            )
+            # load attention processors
+            pipeline.load_lora_weights(args.output_dir)
+
+            # run inference
+            images = []
+            if args.validation_prompt and args.num_validation_images > 0:
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=validation_embeddings,
+                    epoch=epoch,
+                    is_final_validation=True,
+                    torch_dtype=weight_dtype,
+                )
+            images = None
+            del pipeline
+            free_memory()
+
+        validation_prompt = args.validation_prompt if args.validation_prompt else args.final_validation_prompt
+        quant_training = None
+        if args.do_fp8_training:
+            quant_training = "FP8 TorchAO"
+        elif args.bnb_quantization_config_path:
+            quant_training = "BitsandBytes"
+        save_model_card(
+            (args.hub_model_id or Path(args.output_dir).name) if not args.push_to_hub else repo_id,
+            images=images,
+            base_model=args.pretrained_model_name_or_path,
+            instance_prompt=args.instance_prompt,
+            validation_prompt=validation_prompt,
+            repo_folder=args.output_dir,
+            quant_training=quant_training,
+        )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/dreambooth/train_dreambooth_lora_flux2_img2img.py b/examples/dreambooth/train_dreambooth_lora_flux2_img2img.py
new file mode 100644
index 000000000000..32bce9531b71
--- /dev/null
+++ b/examples/dreambooth/train_dreambooth_lora_flux2_img2img.py
@@ -0,0 +1,1831 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# /// script
+# dependencies = [
+#     "diffusers @ git+https://github.com/huggingface/diffusers.git",
+#     "torch>=2.0.0",
+#     "accelerate>=0.31.0",
+#     "transformers>=4.41.2",
+#     "ftfy",
+#     "tensorboard",
+#     "Jinja2",
+#     "peft>=0.11.1",
+#     "sentencepiece",
+#     "torchvision",
+#     "datasets",
+#     "bitsandbytes",
+#     "prodigyopt",
+# ]
+# ///
+
+import argparse
+import copy
+import itertools
+import json
+import logging
+import math
+import os
+import random
+import shutil
+from contextlib import nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from peft import LoraConfig, prepare_model_for_kbit_training, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
+from PIL import Image
+from PIL.ImageOps import exif_transpose
+from torch.utils.data import Dataset
+from torch.utils.data.sampler import BatchSampler
+from torchvision import transforms
+from torchvision.transforms import functional as TF
+from tqdm.auto import tqdm
+from transformers import Mistral3ForConditionalGeneration, PixtralProcessor
+
+import diffusers
+from diffusers import (
+    AutoencoderKLFlux2,
+    BitsAndBytesConfig,
+    FlowMatchEulerDiscreteScheduler,
+    Flux2Pipeline,
+    Flux2Transformer2DModel,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.pipelines.flux2.image_processor import Flux2ImageProcessor
+from diffusers.training_utils import (
+    _collate_lora_metadata,
+    cast_training_params,
+    compute_density_for_timestep_sampling,
+    compute_loss_weighting_for_sd3,
+    find_nearest_bucket,
+    free_memory,
+    offload_models,
+    parse_buckets_string,
+)
+from diffusers.utils import (
+    check_min_version,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+    load_image,
+)
+from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
+from diffusers.utils.import_utils import is_torch_npu_available
+from diffusers.utils.torch_utils import is_compiled_module
+
+
+if is_wandb_available():
+    import wandb
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.36.0.dev0")
+
+logger = get_logger(__name__)
+
+
+def save_model_card(
+    repo_id: str,
+    images=None,
+    base_model: str = None,
+    instance_prompt=None,
+    validation_prompt=None,
+    repo_folder=None,
+    fp8_training=False,
+):
+    widget_dict = []
+    if images is not None:
+        for i, image in enumerate(images):
+            image.save(os.path.join(repo_folder, f"image_{i}.png"))
+            widget_dict.append(
+                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
+            )
+
+    model_description = f"""
+# Flux DreamBooth LoRA - {repo_id}
+
+<Gallery />
+
+## Model description
+
+These are {repo_id} DreamBooth LoRA weights for {base_model}.
+
+The weights were trained using [DreamBooth](https://dreambooth.github.io/) with the [Flux2 diffusers trainer](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_flux2.md).
+
+FP8 training? {fp8_training}
+
+## Trigger words
+
+You should use `{instance_prompt}` to trigger the image generation.
+
+## Download model
+
+[Download the *.safetensors LoRA]({repo_id}/tree/main) in the Files & versions tab.
+
+## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+pipeline = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.2", torch_dtype=torch.bfloat16).to('cuda')
+pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
+image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
+```
+
+For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
+
+## License
+
+Please adhere to the licensing terms as described [here](https://huggingface.co/black-forest-labs/FLUX.2/blob/main/LICENSE.md).
+"""
+    model_card = load_or_create_model_card(
+        repo_id_or_path=repo_id,
+        from_training=True,
+        license="other",
+        base_model=base_model,
+        prompt=instance_prompt,
+        model_description=model_description,
+        widget=widget_dict,
+    )
+    tags = [
+        "text-to-image",
+        "diffusers-training",
+        "diffusers",
+        "lora",
+        "flux2",
+        "flux2-diffusers",
+        "template:sd-lora",
+    ]
+
+    model_card = populate_model_card(model_card, tags=tags)
+    model_card.save(os.path.join(repo_folder, "README.md"))
+
+
+def log_validation(
+    pipeline,
+    args,
+    accelerator,
+    pipeline_args,
+    epoch,
+    torch_dtype,
+    is_final_validation=False,
+):
+    args.num_validation_images = args.num_validation_images if args.num_validation_images else 1
+    logger.info(
+        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+        f" {args.validation_prompt}."
+    )
+    pipeline = pipeline.to(dtype=torch_dtype)
+    pipeline.enable_model_cpu_offload()
+    pipeline.set_progress_bar_config(disable=True)
+
+    # run inference
+    generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed is not None else None
+    autocast_ctx = torch.autocast(accelerator.device.type) if not is_final_validation else nullcontext()
+
+    images = []
+    for _ in range(args.num_validation_images):
+        with autocast_ctx:
+            image = pipeline(
+                image=pipeline_args["image"],
+                prompt_embeds=pipeline_args["prompt_embeds"],
+                generator=generator,
+            ).images[0]
+            images.append(image)
+
+    for tracker in accelerator.trackers:
+        phase_name = "test" if is_final_validation else "validation"
+        if tracker.name == "tensorboard":
+            np_images = np.stack([np.asarray(img) for img in images])
+            tracker.writer.add_images(phase_name, np_images, epoch, dataformats="NHWC")
+        if tracker.name == "wandb":
+            tracker.log(
+                {
+                    phase_name: [
+                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images)
+                    ]
+                }
+            )
+
+    del pipeline
+    free_memory()
+
+    return images
+
+
+def module_filter_fn(mod: torch.nn.Module, fqn: str):
+    # don't convert the output module
+    if fqn == "proj_out":
+        return False
+    # don't convert linear modules with weight dimensions not divisible by 16
+    if isinstance(mod, torch.nn.Linear):
+        if mod.in_features % 16 != 0 or mod.out_features % 16 != 0:
+            return False
+    return True
+
+
+def parse_args(input_args=None):
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--bnb_quantization_config_path",
+        type=str,
+        default=None,
+        help="Quantization config in a JSON file that will be used to define the bitsandbytes quant config of the DiT.",
+    )
+    parser.add_argument(
+        "--do_fp8_training",
+        action="store_true",
+        help="if we are doing FP8 training.",
+    )
+    parser.add_argument(
+        "--variant",
+        type=str,
+        default=None,
+        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help=("A folder containing the training data. "),
+    )
+
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+
+    parser.add_argument(
+        "--image_column",
+        type=str,
+        default="image",
+        help="The column of the dataset containing the target image. By "
+        "default, the standard Image Dataset maps out 'file_name' "
+        "to 'image'.",
+    )
+    parser.add_argument(
+        "--cond_image_column",
+        type=str,
+        default=None,
+        help="Column in the dataset containing the condition image. Must be specified when performing I2I fine-tuning",
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default=None,
+        help="The column of the dataset containing the instance prompt for each image",
+    )
+
+    parser.add_argument("--repeats", type=int, default=1, help="How many times to repeat the training data.")
+
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        required=True,
+        help="The prompt with identifier specifying the instance, e.g. 'photo of a TOK dog', 'in the style of TOK'",
+    )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=512,
+        help="Maximum sequence length to use with with the T5 text encoder",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--validation_image",
+        type=str,
+        default=None,
+        help="path to an image that is used during validation as the condition image to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--skip_final_inference",
+        default=False,
+        action="store_true",
+        help="Whether to skip the final inference step with loaded lora weights upon training completion. This will run intermediate validation inference if `validation_prompt` is provided. Specify to reduce memory.",
+    )
+    parser.add_argument(
+        "--final_validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during a final validation to verify that the model is learning. Ignored if `--validation_prompt` is provided.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--rank",
+        type=int,
+        default=4,
+        help=("The dimension of the LoRA update matrices."),
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=4,
+        help="LoRA alpha to be used for additional scaling.",
+    )
+    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
+
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="flux-dreambooth-lora",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--aspect_ratio_buckets",
+        type=str,
+        default=None,
+        help=(
+            "Aspect ratio buckets to use for training. Define as a string of 'h1,w1;h2,w2;...'. "
+            "e.g. '1024,1024;768,1360;1360,768;880,1168;1168,880;1248,832;832,1248'"
+            "Images will be resized and cropped to fit the nearest bucket. If provided, --resolution is ignored."
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        default=False,
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=500,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=("Max number of checkpoints to store."),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=3.5,
+        help="the FLUX.1 dev variant is a guidance distilled model",
+    )
+
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_num_cycles",
+        type=int,
+        default=1,
+        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
+    )
+    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+        ),
+    )
+    parser.add_argument(
+        "--weighting_scheme",
+        type=str,
+        default="none",
+        choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"],
+        help=('We default to the "none" weighting scheme for uniform sampling and uniform loss'),
+    )
+    parser.add_argument(
+        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme."
+    )
+    parser.add_argument(
+        "--mode_scale",
+        type=float,
+        default=1.29,
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default="AdamW",
+        help=('The optimizer type to use. Choose between ["AdamW", "prodigy"]'),
+    )
+
+    parser.add_argument(
+        "--use_8bit_adam",
+        action="store_true",
+        help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW",
+    )
+
+    parser.add_argument(
+        "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam and Prodigy optimizers."
+    )
+    parser.add_argument(
+        "--prodigy_beta3",
+        type=float,
+        default=None,
+        help="coefficients for computing the Prodigy stepsize using running averages. If set to None, "
+        "uses the value of square root of beta2. Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
+    parser.add_argument(
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
+    )
+
+    parser.add_argument(
+        "--lora_layers",
+        type=str,
+        default=None,
+        help=(
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
+        ),
+    )
+
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer and Prodigy optimizers.",
+    )
+
+    parser.add_argument(
+        "--prodigy_use_bias_correction",
+        type=bool,
+        default=True,
+        help="Turn on Adam's bias correction. True by default. Ignored if optimizer is adamW",
+    )
+    parser.add_argument(
+        "--prodigy_safeguard_warmup",
+        type=bool,
+        default=True,
+        help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage. True by default. "
+        "Ignored if optimizer is adamW",
+    )
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Cache the VAE latents",
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--upcast_before_saving",
+        action="store_true",
+        default=False,
+        help=(
+            "Whether to upcast the trained transformer layers to float32 before saving (at the end of training). "
+            "Defaults to precision dtype used for training to save memory"
+        ),
+    )
+    parser.add_argument(
+        "--offload",
+        action="store_true",
+        help="Whether to offload the VAE and the text encoder to CPU when they are not used.",
+    )
+    parser.add_argument(
+        "--remote_text_encoder",
+        action="store_true",
+        help="Whether to use a remote text encoder. This means the text encoder will not be loaded locally and instead, the prompt embeddings will be computed remotely using the HuggingFace Inference API.",
+    )
+
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--enable_npu_flash_attention", action="store_true", help="Enabla Flash Attention for NPU")
+
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
+    if args.cond_image_column is None:
+        raise ValueError(
+            "you must provide --cond_image_column for image-to-image training. Otherwise please see Flux2 text-to-image training example."
+        )
+    else:
+        assert args.image_column is not None
+        assert args.caption_column is not None
+
+    if args.dataset_name is None and args.instance_data_dir is None:
+        raise ValueError("Specify either `--dataset_name` or `--instance_data_dir`")
+
+    if args.dataset_name is not None and args.instance_data_dir is not None:
+        raise ValueError("Specify only one of `--dataset_name` or `--instance_data_dir`")
+
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        size=1024,
+        repeats=1,
+        center_crop=False,
+        buckets=None,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+
+        self.instance_prompt = instance_prompt
+        self.custom_instance_prompts = None
+
+        self.buckets = buckets
+
+        # if --dataset_name is provided or a metadata jsonl file is provided in the local --instance_data directory,
+        # we load the training data using load_dataset
+        if args.dataset_name is not None:
+            try:
+                from datasets import load_dataset
+            except ImportError:
+                raise ImportError(
+                    "You are trying to load your data using the datasets library. If you wish to train using custom "
+                    "captions please install the datasets library: `pip install datasets`. If you wish to load a "
+                    "local folder containing images only, specify --instance_data_dir instead."
+                )
+            # Downloading and loading a dataset from the hub.
+            # See more about loading custom images at
+            # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script
+            dataset = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                cache_dir=args.cache_dir,
+            )
+            # Preprocessing the datasets.
+            column_names = dataset["train"].column_names
+
+            # 6. Get the column names for input/target.
+            if args.cond_image_column is not None and args.cond_image_column not in column_names:
+                raise ValueError(
+                    f"`--cond_image_column` value '{args.cond_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                )
+            if args.image_column is None:
+                image_column = column_names[0]
+                logger.info(f"image column defaulting to {image_column}")
+            else:
+                image_column = args.image_column
+                if image_column not in column_names:
+                    raise ValueError(
+                        f"`--image_column` value '{args.image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+            instance_images = dataset["train"][image_column]
+            cond_images = None
+            cond_image_column = args.cond_image_column
+            if cond_image_column is not None:
+                cond_images = [dataset["train"][i][cond_image_column] for i in range(len(dataset["train"]))]
+                assert len(instance_images) == len(cond_images)
+
+            if args.caption_column is None:
+                logger.info(
+                    "No caption column provided, defaulting to instance_prompt for all images. If your dataset "
+                    "contains captions/prompts for the images, make sure to specify the "
+                    "column as --caption_column"
+                )
+                self.custom_instance_prompts = None
+            else:
+                if args.caption_column not in column_names:
+                    raise ValueError(
+                        f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
+                    )
+                custom_instance_prompts = dataset["train"][args.caption_column]
+                # create final list of captions according to --repeats
+                self.custom_instance_prompts = []
+                for caption in custom_instance_prompts:
+                    self.custom_instance_prompts.extend(itertools.repeat(caption, repeats))
+        else:
+            self.instance_data_root = Path(instance_data_root)
+            if not self.instance_data_root.exists():
+                raise ValueError("Instance images root doesn't exists.")
+
+            instance_images = [Image.open(path) for path in list(Path(instance_data_root).iterdir())]
+            self.custom_instance_prompts = None
+
+        self.instance_images = []
+        self.cond_images = []
+        for i, img in enumerate(instance_images):
+            self.instance_images.extend(itertools.repeat(img, repeats))
+            if args.dataset_name is not None and cond_images is not None:
+                self.cond_images.extend(itertools.repeat(cond_images[i], repeats))
+
+        self.pixel_values = []
+        self.cond_pixel_values = []
+        for i, image in enumerate(self.instance_images):
+            image = exif_transpose(image)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            dest_image = None
+            if self.cond_images:  # todo: take care of max area for buckets
+                dest_image = self.cond_images[i]
+                image_width, image_height = dest_image.size
+                if image_width * image_height > 1024 * 1024:
+                    dest_image = Flux2ImageProcessor.image_processor._resize_to_target_area(dest_image, 1024 * 1024)
+                    image_width, image_height = dest_image.size
+
+                multiple_of = 2 ** (4 - 1)  # 2 ** (len(vae.config.block_out_channels) - 1), temp!
+                image_width = (image_width // multiple_of) * multiple_of
+                image_height = (image_height // multiple_of) * multiple_of
+                dest_image = Flux2ImageProcessor.image_processor.preprocess(
+                    dest_image, height=image_height, width=image_width, resize_mode="crop"
+                )
+
+                dest_image = exif_transpose(dest_image)
+                if not dest_image.mode == "RGB":
+                    dest_image = dest_image.convert("RGB")
+
+            width, height = image.size
+
+            # Find the closest bucket
+            bucket_idx = find_nearest_bucket(height, width, self.buckets)
+            target_height, target_width = self.buckets[bucket_idx]
+            self.size = (target_height, target_width)
+
+            # based on the bucket assignment, define the transformations
+            image, dest_image = self.paired_transform(
+                image,
+                dest_image=dest_image,
+                size=self.size,
+                center_crop=args.center_crop,
+                random_flip=args.random_flip,
+            )
+            self.pixel_values.append((image, bucket_idx))
+            if dest_image is not None:
+                self.cond_pixel_values.append((dest_image, bucket_idx))
+
+        self.num_instance_images = len(self.instance_images)
+        self._length = self.num_instance_images
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image, bucket_idx = self.pixel_values[index % self.num_instance_images]
+        example["instance_images"] = instance_image
+        example["bucket_idx"] = bucket_idx
+        if self.cond_pixel_values:
+            dest_image, _ = self.cond_pixel_values[index % self.num_instance_images]
+            example["cond_images"] = dest_image
+
+        if self.custom_instance_prompts:
+            caption = self.custom_instance_prompts[index % self.num_instance_images]
+            if caption:
+                example["instance_prompt"] = caption
+            else:
+                example["instance_prompt"] = self.instance_prompt
+
+        else:  # custom prompts were provided, but length does not match size of image dataset
+            example["instance_prompt"] = self.instance_prompt
+
+        return example
+
+    def paired_transform(self, image, dest_image=None, size=(224, 224), center_crop=False, random_flip=False):
+        # 1. Resize (deterministic)
+        resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
+        image = resize(image)
+        if dest_image is not None:
+            dest_image = resize(dest_image)
+
+        # 2. Crop: either center or SAME random crop
+        if center_crop:
+            crop = transforms.CenterCrop(size)
+            image = crop(image)
+            if dest_image is not None:
+                dest_image = crop(dest_image)
+        else:
+            # get_params returns (i, j, h, w)
+            i, j, h, w = transforms.RandomCrop.get_params(image, output_size=size)
+            image = TF.crop(image, i, j, h, w)
+            if dest_image is not None:
+                dest_image = TF.crop(dest_image, i, j, h, w)
+
+        # 3. Random horizontal flip with the SAME coin flip
+        if random_flip:
+            do_flip = random.random() < 0.5
+            if do_flip:
+                image = TF.hflip(image)
+                if dest_image is not None:
+                    dest_image = TF.hflip(dest_image)
+
+        # 4. ToTensor + Normalize (deterministic)
+        to_tensor = transforms.ToTensor()
+        normalize = transforms.Normalize([0.5], [0.5])
+        image = normalize(to_tensor(image))
+        if dest_image is not None:
+            dest_image = normalize(to_tensor(dest_image))
+
+        return (image, dest_image) if dest_image is not None else (image, None)
+
+
+def collate_fn(examples):
+    pixel_values = [example["instance_images"] for example in examples]
+    prompts = [example["instance_prompt"] for example in examples]
+
+    pixel_values = torch.stack(pixel_values)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+    batch = {"pixel_values": pixel_values, "prompts": prompts}
+    if any("cond_images" in example for example in examples):
+        cond_pixel_values = [example["cond_images"] for example in examples]
+        cond_pixel_values = torch.stack(cond_pixel_values)
+        cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
+        batch.update({"cond_pixel_values": cond_pixel_values})
+    return batch
+
+
+class BucketBatchSampler(BatchSampler):
+    def __init__(self, dataset: DreamBoothDataset, batch_size: int, drop_last: bool = False):
+        if not isinstance(batch_size, int) or batch_size <= 0:
+            raise ValueError("batch_size should be a positive integer value, but got batch_size={}".format(batch_size))
+        if not isinstance(drop_last, bool):
+            raise ValueError("drop_last should be a boolean value, but got drop_last={}".format(drop_last))
+
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+
+        # Group indices by bucket
+        self.bucket_indices = [[] for _ in range(len(self.dataset.buckets))]
+        for idx, (_, bucket_idx) in enumerate(self.dataset.pixel_values):
+            self.bucket_indices[bucket_idx].append(idx)
+
+        self.sampler_len = 0
+        self.batches = []
+
+        # Pre-generate batches for each bucket
+        for indices_in_bucket in self.bucket_indices:
+            # Shuffle indices within the bucket
+            random.shuffle(indices_in_bucket)
+            # Create batches
+            for i in range(0, len(indices_in_bucket), self.batch_size):
+                batch = indices_in_bucket[i : i + self.batch_size]
+                if len(batch) < self.batch_size and self.drop_last:
+                    continue  # Skip partial batch if drop_last is True
+                self.batches.append(batch)
+                self.sampler_len += 1  # Count the number of batches
+
+    def __iter__(self):
+        # Shuffle the order of the batches each epoch
+        random.shuffle(self.batches)
+        for batch in self.batches:
+            yield batch
+
+    def __len__(self):
+        return self.sampler_len
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def main(args):
+    if args.report_to == "wandb" and args.hub_token is not None:
+        raise ValueError(
+            "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
+            " Please use `hf auth login` to authenticate with the Hub."
+        )
+
+    if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+    if args.do_fp8_training:
+        from torchao.float8 import Float8LinearConfig, convert_to_float8_training
+
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
+    kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        project_config=accelerator_project_config,
+        kwargs_handlers=[kwargs],
+    )
+
+    # Disable AMP for MPS.
+    if torch.backends.mps.is_available():
+        accelerator.native_amp = False
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name,
+                exist_ok=True,
+            ).repo_id
+
+    # Load the tokenizers
+    tokenizer = PixtralProcessor.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="tokenizer",
+        revision=args.revision,
+    )
+
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Load scheduler and models
+    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="scheduler",
+        revision=args.revision,
+    )
+    noise_scheduler_copy = copy.deepcopy(noise_scheduler)
+    vae = AutoencoderKLFlux2.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+        variant=args.variant,
+    )
+    latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(accelerator.device)
+    latents_bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to(
+        accelerator.device
+    )
+
+    quantization_config = None
+    if args.bnb_quantization_config_path is not None:
+        with open(args.bnb_quantization_config_path, "r") as f:
+            config_kwargs = json.load(f)
+            if "load_in_4bit" in config_kwargs and config_kwargs["load_in_4bit"]:
+                config_kwargs["bnb_4bit_compute_dtype"] = weight_dtype
+        quantization_config = BitsAndBytesConfig(**config_kwargs)
+
+    transformer = Flux2Transformer2DModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="transformer",
+        revision=args.revision,
+        variant=args.variant,
+        quantization_config=quantization_config,
+        torch_dtype=weight_dtype,
+    )
+    if args.bnb_quantization_config_path is not None:
+        transformer = prepare_model_for_kbit_training(transformer, use_gradient_checkpointing=False)
+
+    if not args.remote_text_encoder:
+        text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+        )
+        text_encoder.requires_grad_(False)
+
+    # We only train the additional adapter LoRA layers
+    transformer.requires_grad_(False)
+    vae.requires_grad_(False)
+
+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            transformer.set_attention_backend("_native_npu")
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
+
+    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
+        # due to pytorch#99272, MPS does not yet support bfloat16.
+        raise ValueError(
+            "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
+        )
+
+    to_kwargs = {"dtype": weight_dtype, "device": accelerator.device} if not args.offload else {"dtype": weight_dtype}
+    # flux vae is stable in bf16 so load it in weight_dtype to reduce memory
+    vae.to(**to_kwargs)
+    # we never offload the transformer to CPU, so we can just use the accelerator device
+    transformer_to_kwargs = (
+        {"device": accelerator.device}
+        if args.bnb_quantization_config_path is not None
+        else {"device": accelerator.device, "dtype": weight_dtype}
+    )
+    transformer.to(**transformer_to_kwargs)
+    if args.do_fp8_training:
+        convert_to_float8_training(
+            transformer, module_filter_fn=module_filter_fn, config=Float8LinearConfig(pad_inner_dim=True)
+        )
+
+    if not args.remote_text_encoder:
+        text_encoder.to(**to_kwargs)
+        # Initialize a text encoding pipeline and keep it to CPU for now.
+        text_encoding_pipeline = Flux2Pipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            vae=None,
+            transformer=None,
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            scheduler=None,
+            revision=args.revision,
+        )
+
+    if args.gradient_checkpointing:
+        transformer.enable_gradient_checkpointing()
+
+    if args.lora_layers is not None:
+        target_modules = [layer.strip() for layer in args.lora_layers.split(",")]
+    else:
+        target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
+
+    # now we will add new LoRA weights the transformer layers
+    transformer_lora_config = LoraConfig(
+        r=args.rank,
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        init_lora_weights="gaussian",
+        target_modules=target_modules,
+    )
+    transformer.add_adapter(transformer_lora_config)
+
+    def unwrap_model(model):
+        model = accelerator.unwrap_model(model)
+        model = model._orig_mod if is_compiled_module(model) else model
+        return model
+
+    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            transformer_lora_layers_to_save = None
+            modules_to_save = {}
+            for model in models:
+                if isinstance(model, type(unwrap_model(transformer))):
+                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+                    modules_to_save["transformer"] = model
+                else:
+                    raise ValueError(f"unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            Flux2Pipeline.save_lora_weights(
+                output_dir,
+                transformer_lora_layers=transformer_lora_layers_to_save,
+                **_collate_lora_metadata(modules_to_save),
+            )
+
+    def load_model_hook(models, input_dir):
+        transformer_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        lora_state_dict = Flux2Pipeline.lora_state_dict(input_dir)
+
+        transformer_state_dict = {
+            f"{k.replace('transformer.', '')}": v for k, v in lora_state_dict.items() if k.startswith("transformer.")
+        }
+        transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
+        incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        # Make sure the trainable params are in float32. This is again needed since the base models
+        # are in `weight_dtype`. More details:
+        # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804
+        if args.mixed_precision == "fp16":
+            models = [transformer_]
+            # only upcast trainable parameters (LoRA) into fp32
+            cast_training_params(models)
+
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32 and torch.cuda.is_available():
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        models = [transformer]
+        # only upcast trainable parameters (LoRA) into fp32
+        cast_training_params(models, dtype=torch.float32)
+
+    transformer_lora_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
+
+    # Optimization parameters
+    transformer_parameters_with_lr = {"params": transformer_lora_parameters, "lr": args.learning_rate}
+    params_to_optimize = [transformer_parameters_with_lr]
+
+    # Optimizer creation
+    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
+        logger.warning(
+            f"Unsupported choice of optimizer: {args.optimizer}.Supported optimizers include [adamW, prodigy]."
+            "Defaulting to adamW"
+        )
+        args.optimizer = "adamw"
+
+    if args.use_8bit_adam and not args.optimizer.lower() == "adamw":
+        logger.warning(
+            f"use_8bit_adam is ignored when optimizer is not set to 'AdamW'. Optimizer was "
+            f"set to {args.optimizer.lower()}"
+        )
+
+    if args.optimizer.lower() == "adamw":
+        if args.use_8bit_adam:
+            try:
+                import bitsandbytes as bnb
+            except ImportError:
+                raise ImportError(
+                    "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+                )
+
+            optimizer_class = bnb.optim.AdamW8bit
+        else:
+            optimizer_class = torch.optim.AdamW
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+        )
+
+    if args.optimizer.lower() == "prodigy":
+        try:
+            import prodigyopt
+        except ImportError:
+            raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+        optimizer_class = prodigyopt.Prodigy
+
+        if args.learning_rate <= 0.1:
+            logger.warning(
+                "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+            )
+
+        optimizer = optimizer_class(
+            params_to_optimize,
+            betas=(args.adam_beta1, args.adam_beta2),
+            beta3=args.prodigy_beta3,
+            weight_decay=args.adam_weight_decay,
+            eps=args.adam_epsilon,
+            decouple=args.prodigy_decouple,
+            use_bias_correction=args.prodigy_use_bias_correction,
+            safeguard_warmup=args.prodigy_safeguard_warmup,
+        )
+
+    if args.aspect_ratio_buckets is not None:
+        buckets = parse_buckets_string(args.aspect_ratio_buckets)
+    else:
+        buckets = [(args.resolution, args.resolution)]
+    logger.info(f"Using parsed aspect ratio buckets: {buckets}")
+
+    # Dataset and DataLoaders creation:
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        size=args.resolution,
+        repeats=args.repeats,
+        center_crop=args.center_crop,
+        buckets=buckets,
+    )
+    batch_sampler = BucketBatchSampler(train_dataset, batch_size=args.train_batch_size, drop_last=True)
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_sampler=batch_sampler,
+        collate_fn=lambda examples: collate_fn(examples),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    def compute_text_embeddings(prompt, text_encoding_pipeline):
+        with torch.no_grad():
+            prompt_embeds, text_ids = text_encoding_pipeline.encode_prompt(
+                prompt=prompt, max_sequence_length=args.max_sequence_length
+            )
+            # prompt_embeds = prompt_embeds.to(accelerator.device)
+            # text_ids = text_ids.to(accelerator.device)
+        return prompt_embeds, text_ids
+
+    def compute_remote_text_embeddings(prompts: str | list[str]):
+        import io
+
+        import requests
+
+        if args.hub_token is not None:
+            hf_token = args.hub_token
+        else:
+            from huggingface_hub import get_token
+
+            hf_token = get_token()
+            if hf_token is None:
+                raise ValueError(
+                    "No HuggingFace token found. To use the remote text encoder please login using `hf auth login` or provide a token using --hub_token"
+                )
+
+        def _encode_single(prompt: str):
+            response = requests.post(
+                "https://remote-text-encoder-flux-2.huggingface.co/predict",
+                json={"prompt": prompt},
+                headers={"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"},
+            )
+            assert response.status_code == 200, f"{response.status_code=}"
+            return torch.load(io.BytesIO(response.content))
+
+        try:
+            if isinstance(prompts, (list, tuple)):
+                embeds = [_encode_single(p) for p in prompts]
+                prompt_embeds = torch.cat(embeds, dim=0).to(accelerator.device)
+            else:
+                prompt_embeds = _encode_single(prompts).to(accelerator.device)
+
+            text_ids = Flux2Pipeline._prepare_text_ids(prompt_embeds).to(accelerator.device)
+            return prompt_embeds, text_ids
+
+        except Exception as e:
+            raise RuntimeError("Remote text encoder inference failed.") from e
+
+    # If no type of tuning is done on the text_encoder and custom instance prompts are NOT
+    # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
+    # the redundant encoding.
+    if not train_dataset.custom_instance_prompts:
+        if args.remote_text_encoder:
+            instance_prompt_hidden_states, instance_text_ids = compute_remote_text_embeddings(args.instance_prompt)
+        else:
+            with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                instance_prompt_hidden_states, instance_text_ids = compute_text_embeddings(
+                    args.instance_prompt, text_encoding_pipeline
+                )
+
+    validation_image = load_image(args.validation_image_path).convert("RGB")
+    validation_kwargs = {"image": validation_image}
+    if args.validation_prompt is not None:
+        if args.remote_text_encoder:
+            validation_kwargs["prompt_embeds"] = compute_remote_text_embeddings(args.validation_prompt)
+        else:
+            with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                validation_kwargs["prompt_embeds"] = compute_text_embeddings(
+                    args.validation_prompt, text_encoding_pipeline
+                )
+
+    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
+    # pack the statically computed variables appropriately here. This is so that we don't
+    # have to pass them to the dataloader.
+    if not train_dataset.custom_instance_prompts:
+        prompt_embeds = instance_prompt_hidden_states
+        text_ids = instance_text_ids
+
+    # if cache_latents is set to True, we encode images to latents and store them.
+    # Similar to pre-encoding in the case of a single instance prompt, if custom prompts are provided
+    # we encode them in advance as well.
+    precompute_latents = args.cache_latents or train_dataset.custom_instance_prompts
+    if precompute_latents:
+        prompt_embeds_cache = []
+        text_ids_cache = []
+        latents_cache = []
+        cond_latents_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                if args.cache_latents:
+                    with offload_models(vae, device=accelerator.device, offload=args.offload):
+                        batch["pixel_values"] = batch["pixel_values"].to(
+                            accelerator.device, non_blocking=True, dtype=vae.dtype
+                        )
+                        latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+                        batch["cond_pixel_values"] = batch["cond_pixel_values"].to(
+                            accelerator.device, non_blocking=True, dtype=vae.dtype
+                        )
+                        cond_latents_cache.append(vae.encode(batch["cond_pixel_values"]).latent_dist)
+                if train_dataset.custom_instance_prompts:
+                    if args.remote_text_encoder:
+                        prompt_embeds, text_ids = compute_remote_text_embeddings(batch["prompts"])
+                    else:
+                        with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+                            prompt_embeds, text_ids = compute_text_embeddings(batch["prompts"], text_encoding_pipeline)
+                    prompt_embeds_cache.append(prompt_embeds)
+                    text_ids_cache.append(text_ids)
+
+    # move back to cpu before deleting to ensure memory is freed see: https://github.com/huggingface/diffusers/issues/11376#issue-3008144624
+    if args.cache_latents:
+        vae = vae.to("cpu")
+        del vae
+
+    # move back to cpu before deleting to ensure memory is freed see: https://github.com/huggingface/diffusers/issues/11376#issue-3008144624
+    if not args.remote_text_encoder:
+        text_encoding_pipeline = text_encoding_pipeline.to("cpu")
+        del text_encoder, tokenizer
+    free_memory()
+
+    # Scheduler and math around the number of training steps.
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    if args.max_train_steps is None:
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * accelerator.num_processes * num_update_steps_per_epoch
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
+        num_training_steps=num_training_steps_for_scheduler,
+        num_cycles=args.lr_num_cycles,
+        power=args.lr_power,
+    )
+
+    # Prepare everything with our `accelerator`.
+    transformer, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        transformer, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = "dreambooth-flux2-image2img-lora"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the mos recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+            initial_global_step = 0
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            initial_global_step = global_step
+            first_epoch = global_step // num_update_steps_per_epoch
+
+    else:
+        initial_global_step = 0
+
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+
+    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
+        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
+        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
+        timesteps = timesteps.to(accelerator.device)
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < n_dim:
+            sigma = sigma.unsqueeze(-1)
+        return sigma
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        transformer.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [transformer]
+            prompts = batch["prompts"]
+
+            with accelerator.accumulate(models_to_accumulate):
+                if train_dataset.custom_instance_prompts:
+                    prompt_embeds = prompt_embeds_cache[step]
+                    text_ids = text_ids_cache[step]
+                else:
+                    num_repeat_elements = len(prompts)
+                    prompt_embeds = prompt_embeds.repeat(num_repeat_elements, 1, 1)
+                    text_ids = text_ids.repeat(num_repeat_elements, 1, 1)
+
+                # Convert images to latent space
+                if args.cache_latents:
+                    model_input = latents_cache[step].mode()
+                    cond_model_input = cond_latents_cache[step].mode()
+                else:
+                    with offload_models(vae, device=accelerator.device, offload=args.offload):
+                        pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                        cond_pixel_values = batch["cond_pixel_values"].to(dtype=vae.dtype)
+
+                    model_input = vae.encode(pixel_values).latent_dist.mode()
+                    cond_model_input = vae.encode(cond_pixel_values).latent_dist.mode()
+
+                    # model_input = Flux2Pipeline._encode_vae_image(pixel_values)
+
+                model_input = Flux2Pipeline._patchify_latents(model_input)
+                model_input = (model_input - latents_bn_mean) / latents_bn_std
+
+                cond_model_input = Flux2Pipeline._patchify_latents(cond_model_input)
+                cond_model_input = (cond_model_input - latents_bn_mean) / latents_bn_std
+
+                model_input_ids = Flux2Pipeline._prepare_latent_ids(model_input).to(device=model_input.device)
+                cond_model_input_ids = Flux2Pipeline._prepare_image_ids(cond_model_input).to(
+                    device=cond_model_input.device
+                )
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(model_input)
+                bsz = model_input.shape[0]
+
+                # Sample a random timestep for each image
+                # for weighting schemes where we sample timesteps non-uniformly
+                u = compute_density_for_timestep_sampling(
+                    weighting_scheme=args.weighting_scheme,
+                    batch_size=bsz,
+                    logit_mean=args.logit_mean,
+                    logit_std=args.logit_std,
+                    mode_scale=args.mode_scale,
+                )
+                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
+                timesteps = noise_scheduler_copy.timesteps[indices].to(device=model_input.device)
+
+                # Add noise according to flow matching.
+                # zt = (1 - texp) * x + texp * z1
+                sigmas = get_sigmas(timesteps, n_dim=model_input.ndim, dtype=model_input.dtype)
+                noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
+
+                # [B, C, H, W] -> [B, H*W, C]
+                packed_noisy_model_input = Flux2Pipeline._pack_latents(noisy_model_input)
+                packed_cond_model_input = Flux2Pipeline._pack_latents(cond_model_input)
+
+                # concatenate the model inputs with the cond inputs
+                packed_noisy_model_input = torch.cat([packed_noisy_model_input, packed_cond_model_input], dim=1)
+                model_input_ids = torch.cat([model_input_ids, cond_model_input_ids], dim=1)
+
+                # handle guidance
+                guidance = torch.full([1], args.guidance_scale, device=accelerator.device)
+                guidance = guidance.expand(model_input.shape[0])
+
+                # Predict the noise residual
+                model_pred = transformer(
+                    hidden_states=packed_noisy_model_input,  # (B, image_seq_len, C)
+                    timestep=timesteps / 1000,
+                    guidance=guidance,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,  # B, text_seq_len, 4
+                    img_ids=model_input_ids,  # B, image_seq_len, 4
+                    return_dict=False,
+                )[0]
+                model_pred = model_pred[:, : packed_noisy_model_input.size(1) :]
+
+                model_pred = Flux2Pipeline._unpack_latents_with_ids(model_pred, model_input_ids)
+
+                # these weighting schemes use a uniform timestep sampling
+                # and instead post-weight the loss
+                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+
+                # flow matching loss
+                target = noise - model_input
+
+                # Compute regular loss.
+                loss = torch.mean(
+                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
+                )
+                loss = loss.mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = transformer.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if accelerator.is_main_process:
+                    if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                # create pipeline
+                pipeline = Flux2Pipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    text_encoder=None,
+                    tokenizer=None,
+                    transformer=unwrap_model(transformer),
+                    revision=args.revision,
+                    variant=args.variant,
+                    torch_dtype=weight_dtype,
+                )
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=validation_kwargs,
+                    epoch=epoch,
+                    torch_dtype=weight_dtype,
+                )
+
+                del pipeline
+                free_memory()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        modules_to_save = {}
+        transformer = unwrap_model(transformer)
+        if args.bnb_quantization_config_path is None:
+            if args.upcast_before_saving:
+                transformer.to(torch.float32)
+            else:
+                transformer = transformer.to(weight_dtype)
+        transformer_lora_layers = get_peft_model_state_dict(transformer)
+        modules_to_save["transformer"] = transformer
+
+        Flux2Pipeline.save_lora_weights(
+            save_directory=args.output_dir,
+            transformer_lora_layers=transformer_lora_layers,
+            **_collate_lora_metadata(modules_to_save),
+        )
+
+        images = []
+        run_validation = (args.validation_prompt and args.num_validation_images > 0) or (args.final_validation_prompt)
+        should_run_final_inference = not args.skip_final_inference and run_validation
+        if should_run_final_inference:
+            pipeline = Flux2Pipeline.from_pretrained(
+                args.pretrained_model_name_or_path,
+                revision=args.revision,
+                variant=args.variant,
+                torch_dtype=weight_dtype,
+            )
+            # load attention processors
+            pipeline.load_lora_weights(args.output_dir)
+
+            # run inference
+            images = []
+            if args.validation_prompt and args.num_validation_images > 0:
+                images = log_validation(
+                    pipeline=pipeline,
+                    args=args,
+                    accelerator=accelerator,
+                    pipeline_args=validation_kwargs,
+                    epoch=epoch,
+                    is_final_validation=True,
+                    torch_dtype=weight_dtype,
+                )
+            del pipeline
+            free_memory()
+
+        validation_prompt = args.validation_prompt if args.validation_prompt else args.final_validation_prompt
+        save_model_card(
+            (args.hub_model_id or Path(args.output_dir).name) if not args.push_to_hub else repo_id,
+            images=images,
+            base_model=args.pretrained_model_name_or_path,
+            instance_prompt=args.instance_prompt,
+            validation_prompt=validation_prompt,
+            repo_folder=args.output_dir,
+            fp8_training=args.do_fp8_training,
+        )
+
+        if args.push_to_hub:
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 663d6f6b0898..1fd48dcd159d 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -37,7 +37,7 @@
 from huggingface_hub import create_repo, upload_folder
 from packaging import version
 from peft import LoraConfig
-from peft.utils import get_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer
@@ -46,7 +46,12 @@
 from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import cast_training_params, compute_snr
-from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available
+from diffusers.utils import (
+    check_min_version,
+    convert_state_dict_to_diffusers,
+    convert_unet_state_dict_to_peft,
+    is_wandb_available,
+)
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module
@@ -708,6 +713,56 @@ def collate_fn(examples):
         num_workers=args.dataloader_num_workers,
     )
 
+    def save_model_hook(models, weights, output_dir):
+        if accelerator.is_main_process:
+            unet_lora_layers_to_save = None
+
+            for model in models:
+                if isinstance(model, type(unwrap_model(unet))):
+                    unet_lora_layers_to_save = get_peft_model_state_dict(model)
+                else:
+                    raise ValueError(f"Unexpected save model: {model.__class__}")
+
+                # make sure to pop weight so that corresponding model is not saved again
+                weights.pop()
+
+            StableDiffusionPipeline.save_lora_weights(
+                save_directory=output_dir,
+                unet_lora_layers=unet_lora_layers_to_save,
+                safe_serialization=True,
+            )
+
+    def load_model_hook(models, input_dir):
+        unet_ = None
+
+        while len(models) > 0:
+            model = models.pop()
+            if isinstance(model, type(unwrap_model(unet))):
+                unet_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")
+
+        # returns a tuple of state dictionary and network alphas
+        lora_state_dict, network_alphas = StableDiffusionPipeline.lora_state_dict(input_dir)
+
+        unet_state_dict = {f"{k.replace('unet.', '')}": v for k, v in lora_state_dict.items() if k.startswith("unet.")}
+        unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
+        incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
+
+        if incompatible_keys is not None:
+            # check only for unexpected keys
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            # throw warning if some unexpected keys are found and continue loading
+            if unexpected_keys:
+                logger.warning(
+                    f"Loading adapter weights from state_dict led to unexpected keys not found in the model: "
+                    f" {unexpected_keys}. "
+                )
+
+        # Make sure the trainable params are in float32
+        if args.mixed_precision in ["fp16"]:
+            cast_training_params([unet_], dtype=torch.float32)
+
     # Scheduler and math around the number of training steps.
     # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
     num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
@@ -732,6 +787,10 @@ def collate_fn(examples):
         unet, optimizer, train_dataloader, lr_scheduler
     )
 
+    # Register the hooks for efficient saving and loading of LoRA weights
+    accelerator.register_save_state_pre_hook(save_model_hook)
+    accelerator.register_load_state_pre_hook(load_model_hook)
+
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
@@ -906,17 +965,6 @@ def collate_fn(examples):
                         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                         accelerator.save_state(save_path)
 
-                        unwrapped_unet = unwrap_model(unet)
-                        unet_lora_state_dict = convert_state_dict_to_diffusers(
-                            get_peft_model_state_dict(unwrapped_unet)
-                        )
-
-                        StableDiffusionPipeline.save_lora_weights(
-                            save_directory=save_path,
-                            unet_lora_layers=unet_lora_state_dict,
-                            safe_serialization=True,
-                        )
-
                         logger.info(f"Saved state to {save_path}")
 
             logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md
index 22f982509bb1..6f8276a632f7 100644
--- a/examples/unconditional_image_generation/README.md
+++ b/examples/unconditional_image_generation/README.md
@@ -104,6 +104,8 @@ To use your own dataset, there are 2 ways:
 - you can either provide your own folder as `--train_data_dir`
 - or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument.
 
+If your dataset contains 16 or 32-bit channels (for example, medical TIFFs), add the `--preserve_input_precision` flag so the preprocessing keeps the original precision while still training a 3-channel model. Precision still depends on the decoder: Pillow keeps 16-bit grayscale and float inputs, but many 16-bit RGB files are decoded as 8-bit RGB, and the flag cannot recover precision lost at load time.
+
 Below, we explain both in more detail.
 
 #### Provide the dataset as a folder
diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 3ffeef13647c..0cc96220b932 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -52,6 +52,24 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
     return res.expand(broadcast_shape)
 
 
+def _ensure_three_channels(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Ensure the tensor has exactly three channels (C, H, W) by repeating or truncating channels when needed.
+    """
+    if tensor.ndim == 2:
+        tensor = tensor.unsqueeze(0)
+    channels = tensor.shape[0]
+    if channels == 3:
+        return tensor
+    if channels == 1:
+        return tensor.repeat(3, 1, 1)
+    if channels == 2:
+        return torch.cat([tensor, tensor[:1]], dim=0)
+    if channels > 3:
+        return tensor[:3]
+    raise ValueError(f"Unsupported number of channels: {channels}")
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
@@ -260,6 +278,11 @@ def parse_args():
     parser.add_argument(
         "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
     )
+    parser.add_argument(
+        "--preserve_input_precision",
+        action="store_true",
+        help="Preserve 16/32-bit image precision by avoiding 8-bit RGB conversion while still producing 3-channel tensors.",
+    )
 
     args = parser.parse_args()
     env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -453,19 +476,41 @@ def load_model_hook(models, input_dir):
         # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
 
     # Preprocessing the datasets and DataLoaders creation.
+    spatial_augmentations = [
+        transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+        transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+        transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+    ]
+
     augmentations = transforms.Compose(
-        [
-            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
-            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
-            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+        spatial_augmentations
+        + [
             transforms.ToTensor(),
             transforms.Normalize([0.5], [0.5]),
         ]
     )
 
+    precision_augmentations = transforms.Compose(
+        [
+            transforms.PILToTensor(),
+            transforms.Lambda(_ensure_three_channels),
+            transforms.ConvertImageDtype(torch.float32),
+        ]
+        + spatial_augmentations
+        + [transforms.Normalize([0.5], [0.5])]
+    )
+
     def transform_images(examples):
-        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
-        return {"input": images}
+        processed = []
+        for image in examples["image"]:
+            if not args.preserve_input_precision:
+                processed.append(augmentations(image.convert("RGB")))
+            else:
+                precise_image = image
+                if precise_image.mode == "P":
+                    precise_image = precise_image.convert("RGB")
+                processed.append(precision_augmentations(precise_image))
+        return {"input": processed}
 
     logger.info(f"Dataset size: {len(dataset)}")
 
diff --git a/pyproject.toml b/pyproject.toml
index a864ea34b888..fdda8a6977be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,8 @@
 [tool.ruff]
 line-length = 119
+extend-exclude = [
+    "src/diffusers/pipelines/flux2/system_messages.py",
+]
 
 [tool.ruff.lint]
 # Never enforce `E501` (line length violations).
diff --git a/scripts/convert_flux2_to_diffusers.py b/scripts/convert_flux2_to_diffusers.py
new file mode 100644
index 000000000000..2973913fa215
--- /dev/null
+++ b/scripts/convert_flux2_to_diffusers.py
@@ -0,0 +1,475 @@
+import argparse
+from contextlib import nullcontext
+from typing import Any, Dict, Tuple
+
+import safetensors.torch
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download
+from transformers import AutoProcessor, GenerationConfig, Mistral3ForConditionalGeneration
+
+from diffusers import AutoencoderKLFlux2, FlowMatchEulerDiscreteScheduler, Flux2Pipeline, Flux2Transformer2DModel
+from diffusers.utils.import_utils import is_accelerate_available
+
+
+"""
+# VAE
+
+python scripts/convert_flux2_to_diffusers.py  \
+--original_state_dict_repo_id "diffusers-internal-dev/new-model-image" \
+--vae_filename "flux2-vae.sft" \
+--output_path "/raid/yiyi/dummy-flux2-diffusers" \
+--vae
+
+# DiT
+
+python scripts/convert_flux2_to_diffusers.py \
+  --original_state_dict_repo_id diffusers-internal-dev/new-model-image \
+  --dit_filename flux-dev-dummy.sft \
+  --dit \
+  --output_path .
+
+# Full pipe
+
+python scripts/convert_flux2_to_diffusers.py \
+  --original_state_dict_repo_id diffusers-internal-dev/new-model-image \
+  --dit_filename flux-dev-dummy.sft \
+  --vae_filename "flux2-vae.sft" \
+  --dit --vae --full_pipe \
+  --output_path .
+"""
+
+CTX = init_empty_weights if is_accelerate_available() else nullcontext
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
+parser.add_argument("--vae_filename", default="flux2-vae.sft", type=str)
+parser.add_argument("--dit_filename", default="flux-dev-dummy.sft", type=str)
+parser.add_argument("--vae", action="store_true")
+parser.add_argument("--dit", action="store_true")
+parser.add_argument("--vae_dtype", type=str, default="fp32")
+parser.add_argument("--dit_dtype", type=str, default="bf16")
+parser.add_argument("--checkpoint_path", default=None, type=str)
+parser.add_argument("--full_pipe", action="store_true")
+parser.add_argument("--output_path", type=str)
+
+args = parser.parse_args()
+
+
+def load_original_checkpoint(args, filename):
+    if args.original_state_dict_repo_id is not None:
+        ckpt_path = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename=filename)
+    elif args.checkpoint_path is not None:
+        ckpt_path = args.checkpoint_path
+    else:
+        raise ValueError(" please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")
+
+    original_state_dict = safetensors.torch.load_file(ckpt_path)
+    return original_state_dict
+
+
+DIFFUSERS_VAE_TO_FLUX2_MAPPING = {
+    "encoder.conv_in.weight": "encoder.conv_in.weight",
+    "encoder.conv_in.bias": "encoder.conv_in.bias",
+    "encoder.conv_out.weight": "encoder.conv_out.weight",
+    "encoder.conv_out.bias": "encoder.conv_out.bias",
+    "encoder.conv_norm_out.weight": "encoder.norm_out.weight",
+    "encoder.conv_norm_out.bias": "encoder.norm_out.bias",
+    "decoder.conv_in.weight": "decoder.conv_in.weight",
+    "decoder.conv_in.bias": "decoder.conv_in.bias",
+    "decoder.conv_out.weight": "decoder.conv_out.weight",
+    "decoder.conv_out.bias": "decoder.conv_out.bias",
+    "decoder.conv_norm_out.weight": "decoder.norm_out.weight",
+    "decoder.conv_norm_out.bias": "decoder.norm_out.bias",
+    "quant_conv.weight": "encoder.quant_conv.weight",
+    "quant_conv.bias": "encoder.quant_conv.bias",
+    "post_quant_conv.weight": "decoder.post_quant_conv.weight",
+    "post_quant_conv.bias": "decoder.post_quant_conv.bias",
+    "bn.running_mean": "bn.running_mean",
+    "bn.running_var": "bn.running_var",
+}
+
+
+# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in keys:
+        diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut")
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+
+
+def update_vae_attentions_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping):
+    for ldm_key in keys:
+        diffusers_key = (
+            ldm_key.replace(mapping["old"], mapping["new"])
+            .replace("norm.weight", "group_norm.weight")
+            .replace("norm.bias", "group_norm.bias")
+            .replace("q.weight", "to_q.weight")
+            .replace("q.bias", "to_q.bias")
+            .replace("k.weight", "to_k.weight")
+            .replace("k.bias", "to_k.bias")
+            .replace("v.weight", "to_v.weight")
+            .replace("v.bias", "to_v.bias")
+            .replace("proj_out.weight", "to_out.0.weight")
+            .replace("proj_out.bias", "to_out.0.bias")
+        )
+        new_checkpoint[diffusers_key] = checkpoint.get(ldm_key)
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        shape = new_checkpoint[diffusers_key].shape
+
+        if len(shape) == 3:
+            new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0]
+        elif len(shape) == 4:
+            new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0, 0]
+
+
+def convert_flux2_vae_checkpoint_to_diffusers(vae_state_dict, config):
+    new_checkpoint = {}
+    for diffusers_key, ldm_key in DIFFUSERS_VAE_TO_FLUX2_MAPPING.items():
+        if ldm_key not in vae_state_dict:
+            continue
+        new_checkpoint[diffusers_key] = vae_state_dict[ldm_key]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len(config["down_block_types"])
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"},
+        )
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.get(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.get(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"},
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    update_vae_attentions_ldm_to_diffusers(
+        mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    )
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len(config["up_block_types"])
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"},
+        )
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+        update_vae_resnet_ldm_to_diffusers(
+            resnets,
+            new_checkpoint,
+            vae_state_dict,
+            mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"},
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    update_vae_attentions_ldm_to_diffusers(
+        mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    return new_checkpoint
+
+
+FLUX2_TRANSFORMER_KEYS_RENAME_DICT = {
+    # Image and text input projections
+    "img_in": "x_embedder",
+    "txt_in": "context_embedder",
+    # Timestep and guidance embeddings
+    "time_in.in_layer": "time_guidance_embed.timestep_embedder.linear_1",
+    "time_in.out_layer": "time_guidance_embed.timestep_embedder.linear_2",
+    "guidance_in.in_layer": "time_guidance_embed.guidance_embedder.linear_1",
+    "guidance_in.out_layer": "time_guidance_embed.guidance_embedder.linear_2",
+    # Modulation parameters
+    "double_stream_modulation_img.lin": "double_stream_modulation_img.linear",
+    "double_stream_modulation_txt.lin": "double_stream_modulation_txt.linear",
+    "single_stream_modulation.lin": "single_stream_modulation.linear",
+    # Final output layer
+    # "final_layer.adaLN_modulation.1": "norm_out.linear",  # Handle separately since we need to swap mod params
+    "final_layer.linear": "proj_out",
+}
+
+
+FLUX2_TRANSFORMER_ADA_LAYER_NORM_KEY_MAP = {
+    "final_layer.adaLN_modulation.1": "norm_out.linear",
+}
+
+
+FLUX2_TRANSFORMER_DOUBLE_BLOCK_KEY_MAP = {
+    # Handle fused QKV projections separately as we need to break into Q, K, V projections
+    "img_attn.norm.query_norm": "attn.norm_q",
+    "img_attn.norm.key_norm": "attn.norm_k",
+    "img_attn.proj": "attn.to_out.0",
+    "img_mlp.0": "ff.linear_in",
+    "img_mlp.2": "ff.linear_out",
+    "txt_attn.norm.query_norm": "attn.norm_added_q",
+    "txt_attn.norm.key_norm": "attn.norm_added_k",
+    "txt_attn.proj": "attn.to_add_out",
+    "txt_mlp.0": "ff_context.linear_in",
+    "txt_mlp.2": "ff_context.linear_out",
+}
+
+
+FLUX2_TRANSFORMER_SINGLE_BLOCK_KEY_MAP = {
+    "linear1": "attn.to_qkv_mlp_proj",
+    "linear2": "attn.to_out",
+    "norm.query_norm": "attn.norm_q",
+    "norm.key_norm": "attn.norm_k",
+}
+
+
+# in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+# while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use
+# diffusers implementation
+def swap_scale_shift(weight):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_ada_layer_norm_weights(key: str, state_dict: Dict[str, Any]) -> None:
+    # Skip if not a weight
+    if ".weight" not in key:
+        return
+
+    # If adaLN_modulation is in the key, swap scale and shift parameters
+    # Original implementation is (shift, scale); diffusers implementation is (scale, shift)
+    if "adaLN_modulation" in key:
+        key_without_param_type, param_type = key.rsplit(".", maxsplit=1)
+        # Assume all such keys are in the AdaLayerNorm key map
+        new_key_without_param_type = FLUX2_TRANSFORMER_ADA_LAYER_NORM_KEY_MAP[key_without_param_type]
+        new_key = ".".join([new_key_without_param_type, param_type])
+
+        swapped_weight = swap_scale_shift(state_dict.pop(key))
+        state_dict[new_key] = swapped_weight
+    return
+
+
+def convert_flux2_double_stream_blocks(key: str, state_dict: Dict[str, Any]) -> None:
+    # Skip if not a weight, bias, or scale
+    if ".weight" not in key and ".bias" not in key and ".scale" not in key:
+        return
+
+    new_prefix = "transformer_blocks"
+    if "double_blocks." in key:
+        parts = key.split(".")
+        block_idx = parts[1]
+        modality_block_name = parts[2]  # img_attn, img_mlp, txt_attn, txt_mlp
+        within_block_name = ".".join(parts[2:-1])
+        param_type = parts[-1]
+
+        if param_type == "scale":
+            param_type = "weight"
+
+        if "qkv" in within_block_name:
+            fused_qkv_weight = state_dict.pop(key)
+            to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+            if "img" in modality_block_name:
+                # double_blocks.{N}.img_attn.qkv --> transformer_blocks.{N}.attn.{to_q|to_k|to_v}
+                to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+                new_q_name = "attn.to_q"
+                new_k_name = "attn.to_k"
+                new_v_name = "attn.to_v"
+            elif "txt" in modality_block_name:
+                # double_blocks.{N}.txt_attn.qkv --> transformer_blocks.{N}.attn.{add_q_proj|add_k_proj|add_v_proj}
+                to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+                new_q_name = "attn.add_q_proj"
+                new_k_name = "attn.add_k_proj"
+                new_v_name = "attn.add_v_proj"
+            new_q_key = ".".join([new_prefix, block_idx, new_q_name, param_type])
+            new_k_key = ".".join([new_prefix, block_idx, new_k_name, param_type])
+            new_v_key = ".".join([new_prefix, block_idx, new_v_name, param_type])
+            state_dict[new_q_key] = to_q_weight
+            state_dict[new_k_key] = to_k_weight
+            state_dict[new_v_key] = to_v_weight
+        else:
+            new_within_block_name = FLUX2_TRANSFORMER_DOUBLE_BLOCK_KEY_MAP[within_block_name]
+            new_key = ".".join([new_prefix, block_idx, new_within_block_name, param_type])
+
+            param = state_dict.pop(key)
+            state_dict[new_key] = param
+    return
+
+
+def convert_flux2_single_stream_blocks(key: str, state_dict: Dict[str, Any]) -> None:
+    # Skip if not a weight, bias, or scale
+    if ".weight" not in key and ".bias" not in key and ".scale" not in key:
+        return
+
+    # Mapping:
+    #     - single_blocks.{N}.linear1               --> single_transformer_blocks.{N}.attn.to_qkv_mlp_proj
+    #     - single_blocks.{N}.linear2               --> single_transformer_blocks.{N}.attn.to_out
+    #     - single_blocks.{N}.norm.query_norm.scale --> single_transformer_blocks.{N}.attn.norm_q.weight
+    #     - single_blocks.{N}.norm.key_norm.scale   --> single_transformer_blocks.{N}.attn.norm_k.weight
+    new_prefix = "single_transformer_blocks"
+    if "single_blocks." in key:
+        parts = key.split(".")
+        block_idx = parts[1]
+        within_block_name = ".".join(parts[2:-1])
+        param_type = parts[-1]
+
+        if param_type == "scale":
+            param_type = "weight"
+
+        new_within_block_name = FLUX2_TRANSFORMER_SINGLE_BLOCK_KEY_MAP[within_block_name]
+        new_key = ".".join([new_prefix, block_idx, new_within_block_name, param_type])
+
+        param = state_dict.pop(key)
+        state_dict[new_key] = param
+    return
+
+
+TRANSFORMER_SPECIAL_KEYS_REMAP = {
+    "adaLN_modulation": convert_ada_layer_norm_weights,
+    "double_blocks": convert_flux2_double_stream_blocks,
+    "single_blocks": convert_flux2_single_stream_blocks,
+}
+
+
+def update_state_dict(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
+    state_dict[new_key] = state_dict.pop(old_key)
+
+
+def get_flux2_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
+    if model_type == "test" or model_type == "dummy-flux2":
+        config = {
+            "model_id": "diffusers-internal-dev/dummy-flux2",
+            "diffusers_config": {
+                "patch_size": 1,
+                "in_channels": 128,
+                "num_layers": 8,
+                "num_single_layers": 48,
+                "attention_head_dim": 128,
+                "num_attention_heads": 48,
+                "joint_attention_dim": 15360,
+                "timestep_guidance_channels": 256,
+                "mlp_ratio": 3.0,
+                "axes_dims_rope": (32, 32, 32, 32),
+                "rope_theta": 2000,
+                "eps": 1e-6,
+            },
+        }
+        rename_dict = FLUX2_TRANSFORMER_KEYS_RENAME_DICT
+        special_keys_remap = TRANSFORMER_SPECIAL_KEYS_REMAP
+    return config, rename_dict, special_keys_remap
+
+
+def convert_flux2_transformer_to_diffusers(original_state_dict: Dict[str, torch.Tensor], model_type: str):
+    config, rename_dict, special_keys_remap = get_flux2_transformer_config(model_type)
+
+    diffusers_config = config["diffusers_config"]
+
+    with init_empty_weights():
+        transformer = Flux2Transformer2DModel.from_config(diffusers_config)
+
+    # Handle official code --> diffusers key remapping via the remap dict
+    for key in list(original_state_dict.keys()):
+        new_key = key[:]
+        for replace_key, rename_key in rename_dict.items():
+            new_key = new_key.replace(replace_key, rename_key)
+        update_state_dict(original_state_dict, key, new_key)
+
+    # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
+    # special_keys_remap
+    for key in list(original_state_dict.keys()):
+        for special_key, handler_fn_inplace in special_keys_remap.items():
+            if special_key not in key:
+                continue
+            handler_fn_inplace(key, original_state_dict)
+
+    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
+    return transformer
+
+
+def main(args):
+    if args.vae:
+        original_vae_ckpt = load_original_checkpoint(args, filename=args.vae_filename)
+        vae = AutoencoderKLFlux2()
+        converted_vae_state_dict = convert_flux2_vae_checkpoint_to_diffusers(original_vae_ckpt, vae.config)
+        vae.load_state_dict(converted_vae_state_dict, strict=True)
+        if not args.full_pipe:
+            vae_dtype = torch.bfloat16 if args.vae_dtype == "bf16" else torch.float32
+            vae.to(vae_dtype).save_pretrained(f"{args.output_path}/vae")
+
+    if args.dit:
+        original_dit_ckpt = load_original_checkpoint(args, filename=args.dit_filename)
+        transformer = convert_flux2_transformer_to_diffusers(original_dit_ckpt, "test")
+        if not args.full_pipe:
+            dit_dtype = torch.bfloat16 if args.dit_dtype == "bf16" else torch.float32
+            transformer.to(dit_dtype).save_pretrained(f"{args.output_path}/transformer")
+
+    if args.full_pipe:
+        tokenizer_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+        text_encoder_id = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
+        generate_config = GenerationConfig.from_pretrained(text_encoder_id)
+        generate_config.do_sample = True
+        text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
+            text_encoder_id, generation_config=generate_config, torch_dtype=torch.bfloat16
+        )
+        tokenizer = AutoProcessor.from_pretrained(tokenizer_id)
+        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            "black-forest-labs/FLUX.1-dev", subfolder="scheduler"
+        )
+
+        pipe = Flux2Pipeline(
+            vae=vae, transformer=transformer, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler
+        )
+        pipe.save_pretrained(args.output_path)
+
+
+if __name__ == "__main__":
+    main(args)
diff --git a/scripts/convert_hunyuan_video1_5_to_diffusers.py b/scripts/convert_hunyuan_video1_5_to_diffusers.py
new file mode 100644
index 000000000000..89e5cdb16956
--- /dev/null
+++ b/scripts/convert_hunyuan_video1_5_to_diffusers.py
@@ -0,0 +1,875 @@
+import argparse
+import json
+import os
+import pathlib
+
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download, snapshot_download
+from safetensors.torch import load_file
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
+    T5EncoderModel,
+)
+
+from diffusers import (
+    AutoencoderKLHunyuanVideo15,
+    ClassifierFreeGuidance,
+    FlowMatchEulerDiscreteScheduler,
+    HunyuanVideo15ImageToVideoPipeline,
+    HunyuanVideo15Pipeline,
+    HunyuanVideo15Transformer3DModel,
+)
+
+
+# to convert only transformer
+"""
+python scripts/convert_hunyuan_video1_5_to_diffusers.py \
+    --original_state_dict_repo_id tencent/HunyuanVideo-1.5\
+    --output_path /fsx/yiyi/HunyuanVideo-1.5-Diffusers/transformer\
+    --transformer_type 480p_t2v
+"""
+
+# to convert full pipeline
+"""
+python scripts/convert_hunyuan_video1_5_to_diffusers.py \
+    --original_state_dict_repo_id tencent/HunyuanVideo-1.5\
+    --output_path /fsx/yiyi/HunyuanVideo-1.5-Diffusers \
+    --save_pipeline \
+    --byt5_path /fsx/yiyi/hy15/text_encoder/Glyph-SDXL-v2\
+    --transformer_type 480p_t2v
+"""
+
+
+TRANSFORMER_CONFIGS = {
+    "480p_t2v": {
+        "target_size": 640,
+        "task_type": "i2v",
+    },
+    "720p_t2v": {
+        "target_size": 960,
+        "task_type": "t2v",
+    },
+    "720p_i2v": {
+        "target_size": 960,
+        "task_type": "i2v",
+    },
+    "480p_t2v_distilled": {
+        "target_size": 640,
+        "task_type": "t2v",
+    },
+    "480p_i2v_distilled": {
+        "target_size": 640,
+        "task_type": "i2v",
+    },
+    "720p_i2v_distilled": {
+        "target_size": 960,
+        "task_type": "i2v",
+    },
+    "480p_i2v_step_distilled": {
+        "target_size": 640,
+        "task_type": "i2v",
+        "use_meanflow": True,
+    },
+}
+
+SCHEDULER_CONFIGS = {
+    "480p_t2v": {
+        "shift": 5.0,
+    },
+    "480p_i2v": {
+        "shift": 5.0,
+    },
+    "720p_t2v": {
+        "shift": 9.0,
+    },
+    "720p_i2v": {
+        "shift": 7.0,
+    },
+    "480p_t2v_distilled": {
+        "shift": 5.0,
+    },
+    "480p_i2v_distilled": {
+        "shift": 5.0,
+    },
+    "720p_i2v_distilled": {
+        "shift": 7.0,
+    },
+    "480p_i2v_step_distilled": {
+        "shift": 7.0,
+    },
+}
+
+GUIDANCE_CONFIGS = {
+    "480p_t2v": {
+        "guidance_scale": 6.0,
+    },
+    "480p_i2v": {
+        "guidance_scale": 6.0,
+    },
+    "720p_t2v": {
+        "guidance_scale": 6.0,
+    },
+    "720p_i2v": {
+        "guidance_scale": 6.0,
+    },
+    "480p_t2v_distilled": {
+        "guidance_scale": 1.0,
+    },
+    "480p_i2v_distilled": {
+        "guidance_scale": 1.0,
+    },
+    "720p_i2v_distilled": {
+        "guidance_scale": 1.0,
+    },
+    "480p_i2v_step_distilled": {
+        "guidance_scale": 1.0,
+    },
+}
+
+
+def swap_scale_shift(weight):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=None):
+    """
+    Convert HunyuanVideo 1.5 original checkpoint to Diffusers format.
+    """
+    converted_state_dict = {}
+
+    # 1. time_embed.timestep_embedder <- time_in
+    converted_state_dict["time_embed.timestep_embedder.linear_1.weight"] = original_state_dict.pop(
+        "time_in.mlp.0.weight"
+    )
+    converted_state_dict["time_embed.timestep_embedder.linear_1.bias"] = original_state_dict.pop("time_in.mlp.0.bias")
+    converted_state_dict["time_embed.timestep_embedder.linear_2.weight"] = original_state_dict.pop(
+        "time_in.mlp.2.weight"
+    )
+    converted_state_dict["time_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop("time_in.mlp.2.bias")
+
+    if config.use_meanflow:
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.0.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.0.bias"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.2.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.2.bias"
+        )
+
+    # 2. context_embedder.time_text_embed.timestep_embedder <- txt_in.t_embedder
+    converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_1.weight"] = (
+        original_state_dict.pop("txt_in.t_embedder.mlp.0.weight")
+    )
+    converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_1.bias"] = original_state_dict.pop(
+        "txt_in.t_embedder.mlp.0.bias"
+    )
+    converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_2.weight"] = (
+        original_state_dict.pop("txt_in.t_embedder.mlp.2.weight")
+    )
+    converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop(
+        "txt_in.t_embedder.mlp.2.bias"
+    )
+
+    # 3. context_embedder.time_text_embed.text_embedder <- txt_in.c_embedder
+    converted_state_dict["context_embedder.time_text_embed.text_embedder.linear_1.weight"] = original_state_dict.pop(
+        "txt_in.c_embedder.linear_1.weight"
+    )
+    converted_state_dict["context_embedder.time_text_embed.text_embedder.linear_1.bias"] = original_state_dict.pop(
+        "txt_in.c_embedder.linear_1.bias"
+    )
+    converted_state_dict["context_embedder.time_text_embed.text_embedder.linear_2.weight"] = original_state_dict.pop(
+        "txt_in.c_embedder.linear_2.weight"
+    )
+    converted_state_dict["context_embedder.time_text_embed.text_embedder.linear_2.bias"] = original_state_dict.pop(
+        "txt_in.c_embedder.linear_2.bias"
+    )
+
+    # 4. context_embedder.proj_in <- txt_in.input_embedder
+    converted_state_dict["context_embedder.proj_in.weight"] = original_state_dict.pop("txt_in.input_embedder.weight")
+    converted_state_dict["context_embedder.proj_in.bias"] = original_state_dict.pop("txt_in.input_embedder.bias")
+
+    # 5. context_embedder.token_refiner <- txt_in.individual_token_refiner
+    num_refiner_blocks = 2
+    for i in range(num_refiner_blocks):
+        block_prefix = f"context_embedder.token_refiner.refiner_blocks.{i}."
+        orig_prefix = f"txt_in.individual_token_refiner.blocks.{i}."
+
+        # norm1
+        converted_state_dict[f"{block_prefix}norm1.weight"] = original_state_dict.pop(f"{orig_prefix}norm1.weight")
+        converted_state_dict[f"{block_prefix}norm1.bias"] = original_state_dict.pop(f"{orig_prefix}norm1.bias")
+
+        # Split self_attn_qkv into to_q, to_k, to_v
+        qkv_weight = original_state_dict.pop(f"{orig_prefix}self_attn_qkv.weight")
+        qkv_bias = original_state_dict.pop(f"{orig_prefix}self_attn_qkv.bias")
+        q, k, v = torch.chunk(qkv_weight, 3, dim=0)
+        q_bias, k_bias, v_bias = torch.chunk(qkv_bias, 3, dim=0)
+
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = q
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = q_bias
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = k
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = k_bias
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = v
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = v_bias
+
+        # self_attn_proj -> attn.to_out.0
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = original_state_dict.pop(
+            f"{orig_prefix}self_attn_proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = original_state_dict.pop(
+            f"{orig_prefix}self_attn_proj.bias"
+        )
+
+        # norm2
+        converted_state_dict[f"{block_prefix}norm2.weight"] = original_state_dict.pop(f"{orig_prefix}norm2.weight")
+        converted_state_dict[f"{block_prefix}norm2.bias"] = original_state_dict.pop(f"{orig_prefix}norm2.bias")
+
+        # mlp -> ff
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = original_state_dict.pop(
+            f"{orig_prefix}mlp.fc1.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = original_state_dict.pop(
+            f"{orig_prefix}mlp.fc1.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = original_state_dict.pop(
+            f"{orig_prefix}mlp.fc2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = original_state_dict.pop(f"{orig_prefix}mlp.fc2.bias")
+
+        # adaLN_modulation -> norm_out
+        converted_state_dict[f"{block_prefix}norm_out.linear.weight"] = original_state_dict.pop(
+            f"{orig_prefix}adaLN_modulation.1.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm_out.linear.bias"] = original_state_dict.pop(
+            f"{orig_prefix}adaLN_modulation.1.bias"
+        )
+
+    # 6. context_embedder_2 <- byt5_in
+    converted_state_dict["context_embedder_2.norm.weight"] = original_state_dict.pop("byt5_in.layernorm.weight")
+    converted_state_dict["context_embedder_2.norm.bias"] = original_state_dict.pop("byt5_in.layernorm.bias")
+    converted_state_dict["context_embedder_2.linear_1.weight"] = original_state_dict.pop("byt5_in.fc1.weight")
+    converted_state_dict["context_embedder_2.linear_1.bias"] = original_state_dict.pop("byt5_in.fc1.bias")
+    converted_state_dict["context_embedder_2.linear_2.weight"] = original_state_dict.pop("byt5_in.fc2.weight")
+    converted_state_dict["context_embedder_2.linear_2.bias"] = original_state_dict.pop("byt5_in.fc2.bias")
+    converted_state_dict["context_embedder_2.linear_3.weight"] = original_state_dict.pop("byt5_in.fc3.weight")
+    converted_state_dict["context_embedder_2.linear_3.bias"] = original_state_dict.pop("byt5_in.fc3.bias")
+
+    # 7. image_embedder <- vision_in
+    converted_state_dict["image_embedder.norm_in.weight"] = original_state_dict.pop("vision_in.proj.0.weight")
+    converted_state_dict["image_embedder.norm_in.bias"] = original_state_dict.pop("vision_in.proj.0.bias")
+    converted_state_dict["image_embedder.linear_1.weight"] = original_state_dict.pop("vision_in.proj.1.weight")
+    converted_state_dict["image_embedder.linear_1.bias"] = original_state_dict.pop("vision_in.proj.1.bias")
+    converted_state_dict["image_embedder.linear_2.weight"] = original_state_dict.pop("vision_in.proj.3.weight")
+    converted_state_dict["image_embedder.linear_2.bias"] = original_state_dict.pop("vision_in.proj.3.bias")
+    converted_state_dict["image_embedder.norm_out.weight"] = original_state_dict.pop("vision_in.proj.4.weight")
+    converted_state_dict["image_embedder.norm_out.bias"] = original_state_dict.pop("vision_in.proj.4.bias")
+
+    # 8. x_embedder <- img_in
+    converted_state_dict["x_embedder.proj.weight"] = original_state_dict.pop("img_in.proj.weight")
+    converted_state_dict["x_embedder.proj.bias"] = original_state_dict.pop("img_in.proj.bias")
+
+    # 9. cond_type_embed <- cond_type_embedding
+    converted_state_dict["cond_type_embed.weight"] = original_state_dict.pop("cond_type_embedding.weight")
+
+    # 10. transformer_blocks <- double_blocks
+    num_layers = 54
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        orig_prefix = f"double_blocks.{i}."
+
+        # norm1 (img_mod)
+        converted_state_dict[f"{block_prefix}norm1.linear.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_mod.linear.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1.linear.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_mod.linear.bias"
+        )
+
+        # norm1_context (txt_mod)
+        converted_state_dict[f"{block_prefix}norm1_context.linear.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_mod.linear.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1_context.linear.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_mod.linear.bias"
+        )
+
+        # img attention (to_q, to_k, to_v)
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_q.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_q.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_k.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_k.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_v.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_v.bias"
+        )
+
+        # img attention qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_q_norm.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_k_norm.weight"
+        )
+
+        # img attention output projection
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_attn_proj.bias"
+        )
+
+        # txt attention (add_q_proj, add_k_proj, add_v_proj)
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_q.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_q.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_k.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_k.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_v.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_v.bias"
+        )
+
+        # txt attention qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_q_norm.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_k_norm.weight"
+        )
+
+        # txt attention output projection
+        converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_attn_proj.bias"
+        )
+
+        # norm2 and norm2_context (these don't have weights in the original, they're LayerNorm with elementwise_affine=False)
+        # So we skip them
+
+        # img_mlp -> ff
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_mlp.fc1.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_mlp.fc1.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = original_state_dict.pop(
+            f"{orig_prefix}img_mlp.fc2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = original_state_dict.pop(
+            f"{orig_prefix}img_mlp.fc2.bias"
+        )
+
+        # txt_mlp -> ff_context
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_mlp.fc1.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_mlp.fc1.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = original_state_dict.pop(
+            f"{orig_prefix}txt_mlp.fc2.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = original_state_dict.pop(
+            f"{orig_prefix}txt_mlp.fc2.bias"
+        )
+
+    # 11. norm_out and proj_out <- final_layer
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.weight")
+    )
+    converted_state_dict["norm_out.linear.bias"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.bias")
+    )
+    converted_state_dict["proj_out.weight"] = original_state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = original_state_dict.pop("final_layer.linear.bias")
+
+    return converted_state_dict
+
+
+def convert_hunyuan_video_15_vae_checkpoint_to_diffusers(
+    original_state_dict, block_out_channels=[128, 256, 512, 1024, 1024], layers_per_block=2
+):
+    converted = {}
+
+    # 1. Encoder
+    # 1.1 conv_in
+    converted["encoder.conv_in.conv.weight"] = original_state_dict.pop("encoder.conv_in.conv.weight")
+    converted["encoder.conv_in.conv.bias"] = original_state_dict.pop("encoder.conv_in.conv.bias")
+
+    # 1.2 Down blocks
+    for down_block_index in range(len(block_out_channels)):  # 0 to 4
+        # ResNet blocks
+        for resnet_block_index in range(layers_per_block):  # 0 to 1
+            converted[f"encoder.down_blocks.{down_block_index}.resnets.{resnet_block_index}.norm1.gamma"] = (
+                original_state_dict.pop(f"encoder.down.{down_block_index}.block.{resnet_block_index}.norm1.gamma")
+            )
+            converted[f"encoder.down_blocks.{down_block_index}.resnets.{resnet_block_index}.conv1.conv.weight"] = (
+                original_state_dict.pop(
+                    f"encoder.down.{down_block_index}.block.{resnet_block_index}.conv1.conv.weight"
+                )
+            )
+            converted[f"encoder.down_blocks.{down_block_index}.resnets.{resnet_block_index}.conv1.conv.bias"] = (
+                original_state_dict.pop(f"encoder.down.{down_block_index}.block.{resnet_block_index}.conv1.conv.bias")
+            )
+            converted[f"encoder.down_blocks.{down_block_index}.resnets.{resnet_block_index}.norm2.gamma"] = (
+                original_state_dict.pop(f"encoder.down.{down_block_index}.block.{resnet_block_index}.norm2.gamma")
+            )
+            converted[f"encoder.down_blocks.{down_block_index}.resnets.{resnet_block_index}.conv2.conv.weight"] = (
+                original_state_dict.pop(
+                    f"encoder.down.{down_block_index}.block.{resnet_block_index}.conv2.conv.weight"
+                )
+            )
+            converted[f"encoder.down_blocks.{down_block_index}.resnets.{resnet_block_index}.conv2.conv.bias"] = (
+                original_state_dict.pop(f"encoder.down.{down_block_index}.block.{resnet_block_index}.conv2.conv.bias")
+            )
+
+        # Downsample (if exists)
+        if f"encoder.down.{down_block_index}.downsample.conv.conv.weight" in original_state_dict:
+            converted[f"encoder.down_blocks.{down_block_index}.downsamplers.0.conv.conv.weight"] = (
+                original_state_dict.pop(f"encoder.down.{down_block_index}.downsample.conv.conv.weight")
+            )
+            converted[f"encoder.down_blocks.{down_block_index}.downsamplers.0.conv.conv.bias"] = (
+                original_state_dict.pop(f"encoder.down.{down_block_index}.downsample.conv.conv.bias")
+            )
+
+    # 1.3 Mid block
+    converted["encoder.mid_block.resnets.0.norm1.gamma"] = original_state_dict.pop("encoder.mid.block_1.norm1.gamma")
+    converted["encoder.mid_block.resnets.0.conv1.conv.weight"] = original_state_dict.pop(
+        "encoder.mid.block_1.conv1.conv.weight"
+    )
+    converted["encoder.mid_block.resnets.0.conv1.conv.bias"] = original_state_dict.pop(
+        "encoder.mid.block_1.conv1.conv.bias"
+    )
+    converted["encoder.mid_block.resnets.0.norm2.gamma"] = original_state_dict.pop("encoder.mid.block_1.norm2.gamma")
+    converted["encoder.mid_block.resnets.0.conv2.conv.weight"] = original_state_dict.pop(
+        "encoder.mid.block_1.conv2.conv.weight"
+    )
+    converted["encoder.mid_block.resnets.0.conv2.conv.bias"] = original_state_dict.pop(
+        "encoder.mid.block_1.conv2.conv.bias"
+    )
+
+    converted["encoder.mid_block.resnets.1.norm1.gamma"] = original_state_dict.pop("encoder.mid.block_2.norm1.gamma")
+    converted["encoder.mid_block.resnets.1.conv1.conv.weight"] = original_state_dict.pop(
+        "encoder.mid.block_2.conv1.conv.weight"
+    )
+    converted["encoder.mid_block.resnets.1.conv1.conv.bias"] = original_state_dict.pop(
+        "encoder.mid.block_2.conv1.conv.bias"
+    )
+    converted["encoder.mid_block.resnets.1.norm2.gamma"] = original_state_dict.pop("encoder.mid.block_2.norm2.gamma")
+    converted["encoder.mid_block.resnets.1.conv2.conv.weight"] = original_state_dict.pop(
+        "encoder.mid.block_2.conv2.conv.weight"
+    )
+    converted["encoder.mid_block.resnets.1.conv2.conv.bias"] = original_state_dict.pop(
+        "encoder.mid.block_2.conv2.conv.bias"
+    )
+
+    # Attention block
+    converted["encoder.mid_block.attentions.0.norm.gamma"] = original_state_dict.pop("encoder.mid.attn_1.norm.gamma")
+    converted["encoder.mid_block.attentions.0.to_q.weight"] = original_state_dict.pop("encoder.mid.attn_1.q.weight")
+    converted["encoder.mid_block.attentions.0.to_q.bias"] = original_state_dict.pop("encoder.mid.attn_1.q.bias")
+    converted["encoder.mid_block.attentions.0.to_k.weight"] = original_state_dict.pop("encoder.mid.attn_1.k.weight")
+    converted["encoder.mid_block.attentions.0.to_k.bias"] = original_state_dict.pop("encoder.mid.attn_1.k.bias")
+    converted["encoder.mid_block.attentions.0.to_v.weight"] = original_state_dict.pop("encoder.mid.attn_1.v.weight")
+    converted["encoder.mid_block.attentions.0.to_v.bias"] = original_state_dict.pop("encoder.mid.attn_1.v.bias")
+    converted["encoder.mid_block.attentions.0.proj_out.weight"] = original_state_dict.pop(
+        "encoder.mid.attn_1.proj_out.weight"
+    )
+    converted["encoder.mid_block.attentions.0.proj_out.bias"] = original_state_dict.pop(
+        "encoder.mid.attn_1.proj_out.bias"
+    )
+
+    # 1.4 Encoder output
+    converted["encoder.norm_out.gamma"] = original_state_dict.pop("encoder.norm_out.gamma")
+    converted["encoder.conv_out.conv.weight"] = original_state_dict.pop("encoder.conv_out.conv.weight")
+    converted["encoder.conv_out.conv.bias"] = original_state_dict.pop("encoder.conv_out.conv.bias")
+
+    # 2. Decoder
+    # 2.1 conv_in
+    converted["decoder.conv_in.conv.weight"] = original_state_dict.pop("decoder.conv_in.conv.weight")
+    converted["decoder.conv_in.conv.bias"] = original_state_dict.pop("decoder.conv_in.conv.bias")
+
+    # 2.2 Mid block
+    converted["decoder.mid_block.resnets.0.norm1.gamma"] = original_state_dict.pop("decoder.mid.block_1.norm1.gamma")
+    converted["decoder.mid_block.resnets.0.conv1.conv.weight"] = original_state_dict.pop(
+        "decoder.mid.block_1.conv1.conv.weight"
+    )
+    converted["decoder.mid_block.resnets.0.conv1.conv.bias"] = original_state_dict.pop(
+        "decoder.mid.block_1.conv1.conv.bias"
+    )
+    converted["decoder.mid_block.resnets.0.norm2.gamma"] = original_state_dict.pop("decoder.mid.block_1.norm2.gamma")
+    converted["decoder.mid_block.resnets.0.conv2.conv.weight"] = original_state_dict.pop(
+        "decoder.mid.block_1.conv2.conv.weight"
+    )
+    converted["decoder.mid_block.resnets.0.conv2.conv.bias"] = original_state_dict.pop(
+        "decoder.mid.block_1.conv2.conv.bias"
+    )
+
+    converted["decoder.mid_block.resnets.1.norm1.gamma"] = original_state_dict.pop("decoder.mid.block_2.norm1.gamma")
+    converted["decoder.mid_block.resnets.1.conv1.conv.weight"] = original_state_dict.pop(
+        "decoder.mid.block_2.conv1.conv.weight"
+    )
+    converted["decoder.mid_block.resnets.1.conv1.conv.bias"] = original_state_dict.pop(
+        "decoder.mid.block_2.conv1.conv.bias"
+    )
+    converted["decoder.mid_block.resnets.1.norm2.gamma"] = original_state_dict.pop("decoder.mid.block_2.norm2.gamma")
+    converted["decoder.mid_block.resnets.1.conv2.conv.weight"] = original_state_dict.pop(
+        "decoder.mid.block_2.conv2.conv.weight"
+    )
+    converted["decoder.mid_block.resnets.1.conv2.conv.bias"] = original_state_dict.pop(
+        "decoder.mid.block_2.conv2.conv.bias"
+    )
+
+    # Decoder attention block
+    converted["decoder.mid_block.attentions.0.norm.gamma"] = original_state_dict.pop("decoder.mid.attn_1.norm.gamma")
+    converted["decoder.mid_block.attentions.0.to_q.weight"] = original_state_dict.pop("decoder.mid.attn_1.q.weight")
+    converted["decoder.mid_block.attentions.0.to_q.bias"] = original_state_dict.pop("decoder.mid.attn_1.q.bias")
+    converted["decoder.mid_block.attentions.0.to_k.weight"] = original_state_dict.pop("decoder.mid.attn_1.k.weight")
+    converted["decoder.mid_block.attentions.0.to_k.bias"] = original_state_dict.pop("decoder.mid.attn_1.k.bias")
+    converted["decoder.mid_block.attentions.0.to_v.weight"] = original_state_dict.pop("decoder.mid.attn_1.v.weight")
+    converted["decoder.mid_block.attentions.0.to_v.bias"] = original_state_dict.pop("decoder.mid.attn_1.v.bias")
+    converted["decoder.mid_block.attentions.0.proj_out.weight"] = original_state_dict.pop(
+        "decoder.mid.attn_1.proj_out.weight"
+    )
+    converted["decoder.mid_block.attentions.0.proj_out.bias"] = original_state_dict.pop(
+        "decoder.mid.attn_1.proj_out.bias"
+    )
+
+    # 2.3 Up blocks
+    for up_block_index in range(len(block_out_channels)):  # 0 to 5
+        # ResNet blocks
+        for resnet_block_index in range(layers_per_block + 1):  # 0 to 2 (decoder has 3 resnets per level)
+            converted[f"decoder.up_blocks.{up_block_index}.resnets.{resnet_block_index}.norm1.gamma"] = (
+                original_state_dict.pop(f"decoder.up.{up_block_index}.block.{resnet_block_index}.norm1.gamma")
+            )
+            converted[f"decoder.up_blocks.{up_block_index}.resnets.{resnet_block_index}.conv1.conv.weight"] = (
+                original_state_dict.pop(f"decoder.up.{up_block_index}.block.{resnet_block_index}.conv1.conv.weight")
+            )
+            converted[f"decoder.up_blocks.{up_block_index}.resnets.{resnet_block_index}.conv1.conv.bias"] = (
+                original_state_dict.pop(f"decoder.up.{up_block_index}.block.{resnet_block_index}.conv1.conv.bias")
+            )
+            converted[f"decoder.up_blocks.{up_block_index}.resnets.{resnet_block_index}.norm2.gamma"] = (
+                original_state_dict.pop(f"decoder.up.{up_block_index}.block.{resnet_block_index}.norm2.gamma")
+            )
+            converted[f"decoder.up_blocks.{up_block_index}.resnets.{resnet_block_index}.conv2.conv.weight"] = (
+                original_state_dict.pop(f"decoder.up.{up_block_index}.block.{resnet_block_index}.conv2.conv.weight")
+            )
+            converted[f"decoder.up_blocks.{up_block_index}.resnets.{resnet_block_index}.conv2.conv.bias"] = (
+                original_state_dict.pop(f"decoder.up.{up_block_index}.block.{resnet_block_index}.conv2.conv.bias")
+            )
+
+        # Upsample (if exists)
+        if f"decoder.up.{up_block_index}.upsample.conv.conv.weight" in original_state_dict:
+            converted[f"decoder.up_blocks.{up_block_index}.upsamplers.0.conv.conv.weight"] = original_state_dict.pop(
+                f"decoder.up.{up_block_index}.upsample.conv.conv.weight"
+            )
+            converted[f"decoder.up_blocks.{up_block_index}.upsamplers.0.conv.conv.bias"] = original_state_dict.pop(
+                f"decoder.up.{up_block_index}.upsample.conv.conv.bias"
+            )
+
+    # 2.4 Decoder output
+    converted["decoder.norm_out.gamma"] = original_state_dict.pop("decoder.norm_out.gamma")
+    converted["decoder.conv_out.conv.weight"] = original_state_dict.pop("decoder.conv_out.conv.weight")
+    converted["decoder.conv_out.conv.bias"] = original_state_dict.pop("decoder.conv_out.conv.bias")
+
+    return converted
+
+
+def load_sharded_safetensors(dir: pathlib.Path):
+    file_paths = list(dir.glob("diffusion_pytorch_model*.safetensors"))
+    state_dict = {}
+    for path in file_paths:
+        state_dict.update(load_file(path))
+    return state_dict
+
+
+def load_original_transformer_state_dict(args):
+    if args.original_state_dict_repo_id is not None:
+        model_dir = snapshot_download(
+            args.original_state_dict_repo_id,
+            repo_type="model",
+            allow_patterns="transformer/" + args.transformer_type + "/*",
+        )
+    elif args.original_state_dict_folder is not None:
+        model_dir = pathlib.Path(args.original_state_dict_folder)
+    else:
+        raise ValueError("Please provide either `original_state_dict_repo_id` or `original_state_dict_folder`")
+    model_dir = pathlib.Path(model_dir)
+    model_dir = model_dir / "transformer" / args.transformer_type
+    return load_sharded_safetensors(model_dir)
+
+
+def load_original_vae_state_dict(args):
+    if args.original_state_dict_repo_id is not None:
+        ckpt_path = hf_hub_download(
+            repo_id=args.original_state_dict_repo_id, filename="vae/diffusion_pytorch_model.safetensors"
+        )
+    elif args.original_state_dict_folder is not None:
+        model_dir = pathlib.Path(args.original_state_dict_folder)
+        ckpt_path = model_dir / "vae/diffusion_pytorch_model.safetensors"
+    else:
+        raise ValueError("Please provide either `original_state_dict_repo_id` or `original_state_dict_folder`")
+
+    original_state_dict = load_file(ckpt_path)
+    return original_state_dict
+
+
+def convert_transformer(args):
+    original_state_dict = load_original_transformer_state_dict(args)
+
+    config = TRANSFORMER_CONFIGS[args.transformer_type]
+    with init_empty_weights():
+        transformer = HunyuanVideo15Transformer3DModel(**config)
+    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=transformer.config)
+    transformer.load_state_dict(state_dict, strict=True, assign=True)
+
+    return transformer
+
+
+def convert_vae(args):
+    original_state_dict = load_original_vae_state_dict(args)
+    with init_empty_weights():
+        vae = AutoencoderKLHunyuanVideo15()
+    state_dict = convert_hunyuan_video_15_vae_checkpoint_to_diffusers(original_state_dict)
+    vae.load_state_dict(state_dict, strict=True, assign=True)
+    return vae
+
+
+def load_mllm():
+    print(" loading from Qwen/Qwen2.5-VL-7B-Instruct")
+    text_encoder = AutoModel.from_pretrained(
+        "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
+    )
+    if hasattr(text_encoder, "language_model"):
+        text_encoder = text_encoder.language_model
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", padding_side="right")
+    return text_encoder, tokenizer
+
+
+# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/910da2a829c484ea28982e8cff3bbc2cacdf1681/hyvideo/models/text_encoders/byT5/__init__.py#L89
+def add_special_token(
+    tokenizer,
+    text_encoder,
+    add_color=True,
+    add_font=True,
+    multilingual=True,
+    color_ann_path="assets/color_idx.json",
+    font_ann_path="assets/multilingual_10-lang_idx.json",
+):
+    """
+    Add special tokens for color and font to tokenizer and text encoder.
+
+    Args:
+        tokenizer: Huggingface tokenizer.
+        text_encoder: Huggingface T5 encoder.
+        add_color (bool): Whether to add color tokens.
+        add_font (bool): Whether to add font tokens.
+        color_ann_path (str): Path to color annotation JSON.
+        font_ann_path (str): Path to font annotation JSON.
+        multilingual (bool): Whether to use multilingual font tokens.
+    """
+    with open(font_ann_path, "r") as f:
+        idx_font_dict = json.load(f)
+    with open(color_ann_path, "r") as f:
+        idx_color_dict = json.load(f)
+
+    if multilingual:
+        font_token = [f"<{font_code[:2]}-font-{idx_font_dict[font_code]}>" for font_code in idx_font_dict]
+    else:
+        font_token = [f"<font-{i}>" for i in range(len(idx_font_dict))]
+    color_token = [f"<color-{i}>" for i in range(len(idx_color_dict))]
+    additional_special_tokens = []
+    if add_color:
+        additional_special_tokens += color_token
+    if add_font:
+        additional_special_tokens += font_token
+
+    tokenizer.add_tokens(additional_special_tokens, special_tokens=True)
+    # Set mean_resizing=False to avoid PyTorch LAPACK dependency
+    text_encoder.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+
+
+def load_byt5(args):
+    """
+    Load ByT5 encoder with Glyph-SDXL-v2 weights and save in HuggingFace format.
+    """
+
+    # 1. Load base tokenizer and encoder
+    tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
+
+    # Load as T5EncoderModel
+    encoder = T5EncoderModel.from_pretrained("google/byt5-small")
+
+    byt5_checkpoint_path = os.path.join(args.byt5_path, "checkpoints/byt5_model.pt")
+    color_ann_path = os.path.join(args.byt5_path, "assets/color_idx.json")
+    font_ann_path = os.path.join(args.byt5_path, "assets/multilingual_10-lang_idx.json")
+
+    # 2. Add special tokens
+    add_special_token(
+        tokenizer=tokenizer,
+        text_encoder=encoder,
+        add_color=True,
+        add_font=True,
+        color_ann_path=color_ann_path,
+        font_ann_path=font_ann_path,
+        multilingual=True,
+    )
+
+    # 3. Load Glyph-SDXL-v2 checkpoint
+    print(f"\n3. Loading Glyph-SDXL-v2 checkpoint: {byt5_checkpoint_path}")
+    checkpoint = torch.load(byt5_checkpoint_path, map_location="cpu")
+
+    # Handle different checkpoint formats
+    if "state_dict" in checkpoint:
+        state_dict = checkpoint["state_dict"]
+    else:
+        state_dict = checkpoint
+
+    # add 'encoder.' prefix to the keys
+    # Remove 'module.text_tower.encoder.' prefix if present
+    cleaned_state_dict = {}
+    for key, value in state_dict.items():
+        if key.startswith("module.text_tower.encoder."):
+            new_key = "encoder." + key[len("module.text_tower.encoder.") :]
+            cleaned_state_dict[new_key] = value
+        else:
+            new_key = "encoder." + key
+            cleaned_state_dict[new_key] = value
+
+    # 4. Load weights
+    missing_keys, unexpected_keys = encoder.load_state_dict(cleaned_state_dict, strict=False)
+    if unexpected_keys:
+        raise ValueError(f"Unexpected keys: {unexpected_keys}")
+    if "shared.weight" in missing_keys:
+        print("  Missing shared.weight as expected")
+        missing_keys.remove("shared.weight")
+    if missing_keys:
+        raise ValueError(f"Missing keys: {missing_keys}")
+
+    return encoder, tokenizer
+
+
+def load_siglip():
+    image_encoder = SiglipVisionModel.from_pretrained(
+        "black-forest-labs/FLUX.1-Redux-dev", subfolder="image_encoder", torch_dtype=torch.bfloat16
+    )
+    feature_extractor = SiglipImageProcessor.from_pretrained(
+        "black-forest-labs/FLUX.1-Redux-dev", subfolder="feature_extractor"
+    )
+    return image_encoder, feature_extractor
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--original_state_dict_repo_id", type=str, default=None, help="Path to original hub_id for the model"
+    )
+    parser.add_argument(
+        "--original_state_dict_folder", type=str, default=None, help="Local folder name of the original state dict"
+    )
+    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model(s) should be saved")
+    parser.add_argument("--transformer_type", type=str, default="480p_i2v", choices=list(TRANSFORMER_CONFIGS.keys()))
+    parser.add_argument(
+        "--byt5_path",
+        type=str,
+        default=None,
+        help=(
+            "path to the downloaded byt5 checkpoint & assets. "
+            "Note: They use Glyph-SDXL-v2 as byt5 encoder. You can download from modelscope like: "
+            "`modelscope download --model AI-ModelScope/Glyph-SDXL-v2 --local_dir ./ckpts/text_encoder/Glyph-SDXL-v2` "
+            "or manually download following the instructions on "
+            "https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/910da2a829c484ea28982e8cff3bbc2cacdf1681/checkpoints-download.md. "
+            "The path should point to the Glyph-SDXL-v2 folder which should contain an `assets` folder and a `checkpoints` folder, "
+            "like: Glyph-SDXL-v2/assets/... and Glyph-SDXL-v2/checkpoints/byt5_model.pt"
+        ),
+    )
+    parser.add_argument("--save_pipeline", action="store_true")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    if args.save_pipeline and args.byt5_path is None:
+        raise ValueError("Please provide --byt5_path when saving pipeline")
+
+    transformer = None
+
+    transformer = convert_transformer(args)
+    if not args.save_pipeline:
+        transformer.save_pretrained(args.output_path, safe_serialization=True)
+    else:
+        task_type = transformer.config.task_type
+
+        vae = convert_vae(args)
+
+        text_encoder, tokenizer = load_mllm()
+        text_encoder_2, tokenizer_2 = load_byt5(args)
+
+        flow_shift = SCHEDULER_CONFIGS[args.transformer_type]["shift"]
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=flow_shift)
+
+        guidance_scale = GUIDANCE_CONFIGS[args.transformer_type]["guidance_scale"]
+        guider = ClassifierFreeGuidance(guidance_scale=guidance_scale)
+
+        if task_type == "i2v":
+            image_encoder, feature_extractor = load_siglip()
+            pipeline = HunyuanVideo15ImageToVideoPipeline(
+                vae=vae,
+                text_encoder=text_encoder,
+                text_encoder_2=text_encoder_2,
+                tokenizer=tokenizer,
+                tokenizer_2=tokenizer_2,
+                transformer=transformer,
+                guider=guider,
+                scheduler=scheduler,
+                image_encoder=image_encoder,
+                feature_extractor=feature_extractor,
+            )
+        elif task_type == "t2v":
+            pipeline = HunyuanVideo15Pipeline(
+                vae=vae,
+                text_encoder=text_encoder,
+                text_encoder_2=text_encoder_2,
+                tokenizer=tokenizer,
+                tokenizer_2=tokenizer_2,
+                transformer=transformer,
+                guider=guider,
+                scheduler=scheduler,
+            )
+        else:
+            raise ValueError(f"Task type {task_type} is not supported")
+
+        pipeline.save_pretrained(args.output_path, safe_serialization=True)
diff --git a/scripts/convert_ovis_image_to_diffusers.py b/scripts/convert_ovis_image_to_diffusers.py
new file mode 100644
index 000000000000..0d3d9cd44bf6
--- /dev/null
+++ b/scripts/convert_ovis_image_to_diffusers.py
@@ -0,0 +1,263 @@
+import argparse
+from contextlib import nullcontext
+
+import safetensors.torch
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download
+
+from diffusers import OvisImageTransformer2DModel
+from diffusers.utils.import_utils import is_accelerate_available
+
+
+"""
+# Transformer
+
+python scripts/convert_ovis_image_to_diffusers.py  \
+--original_state_dict_repo_id "AIDC-AI/Ovis-Image-7B" \
+--filename "ovis_image.safetensors"
+--output_path "ovis-image" \
+--transformer
+"""
+
+
+CTX = init_empty_weights if is_accelerate_available() else nullcontext
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
+parser.add_argument("--filename", default="ovis_image.safetensors", type=str)
+parser.add_argument("--checkpoint_path", default=None, type=str)
+parser.add_argument("--in_channels", type=int, default=64)
+parser.add_argument("--out_channels", type=int, default=None)
+parser.add_argument("--transformer", action="store_true")
+parser.add_argument("--output_path", type=str)
+parser.add_argument("--dtype", type=str, default="bf16")
+
+args = parser.parse_args()
+dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float32
+
+
+def load_original_checkpoint(args):
+    if args.original_state_dict_repo_id is not None:
+        ckpt_path = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename=args.filename)
+    elif args.checkpoint_path is not None:
+        ckpt_path = args.checkpoint_path
+    else:
+        raise ValueError(" please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")
+
+    original_state_dict = safetensors.torch.load_file(ckpt_path)
+    return original_state_dict
+
+
+# in SD3 original implementation of AdaLayerNormContinuous, it split linear projection output into shift, scale;
+# while in diffusers it split into scale, shift. Here we swap the linear projection weights in order to be able to use diffusers implementation
+def swap_scale_shift(weight):
+    shift, scale = weight.chunk(2, dim=0)
+    new_weight = torch.cat([scale, shift], dim=0)
+    return new_weight
+
+
+def convert_ovis_image_transformer_checkpoint_to_diffusers(
+    original_state_dict, num_layers, num_single_layers, inner_dim, mlp_ratio=4.0
+):
+    converted_state_dict = {}
+
+    ## time_text_embed.timestep_embedder <-  time_in
+    converted_state_dict["timestep_embedder.linear_1.weight"] = original_state_dict.pop("time_in.in_layer.weight")
+    converted_state_dict["timestep_embedder.linear_1.bias"] = original_state_dict.pop("time_in.in_layer.bias")
+    converted_state_dict["timestep_embedder.linear_2.weight"] = original_state_dict.pop("time_in.out_layer.weight")
+    converted_state_dict["timestep_embedder.linear_2.bias"] = original_state_dict.pop("time_in.out_layer.bias")
+
+    # context_embedder
+    converted_state_dict["context_embedder_norm.weight"] = original_state_dict.pop("semantic_txt_norm.weight")
+    converted_state_dict["context_embedder.weight"] = original_state_dict.pop("semantic_txt_in.weight")
+    converted_state_dict["context_embedder.bias"] = original_state_dict.pop("semantic_txt_in.bias")
+
+    # x_embedder
+    converted_state_dict["x_embedder.weight"] = original_state_dict.pop("img_in.weight")
+    converted_state_dict["x_embedder.bias"] = original_state_dict.pop("img_in.bias")
+
+    # double transformer blocks
+    for i in range(num_layers):
+        block_prefix = f"transformer_blocks.{i}."
+        # norms.
+        ## norm1
+        converted_state_dict[f"{block_prefix}norm1.linear.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mod.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1.linear.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mod.lin.bias"
+        )
+        ## norm1_context
+        converted_state_dict[f"{block_prefix}norm1_context.linear.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mod.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm1_context.linear.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mod.lin.bias"
+        )
+        # Q, K, V
+        sample_q, sample_k, sample_v = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.weight"), 3, dim=0
+        )
+        context_q, context_k, context_v = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.weight"), 3, dim=0
+        )
+        sample_q_bias, sample_k_bias, sample_v_bias = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.img_attn.qkv.bias"), 3, dim=0
+        )
+        context_q_bias, context_k_bias, context_v_bias = torch.chunk(
+            original_state_dict.pop(f"double_blocks.{i}.txt_attn.qkv.bias"), 3, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([sample_q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([sample_q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([sample_k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([sample_k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([sample_v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([sample_v_bias])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.weight"] = torch.cat([context_q])
+        converted_state_dict[f"{block_prefix}attn.add_q_proj.bias"] = torch.cat([context_q_bias])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.weight"] = torch.cat([context_k])
+        converted_state_dict[f"{block_prefix}attn.add_k_proj.bias"] = torch.cat([context_k_bias])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.weight"] = torch.cat([context_v])
+        converted_state_dict[f"{block_prefix}attn.add_v_proj.bias"] = torch.cat([context_v_bias])
+        # qk_norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.norm.query_norm.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.norm.key_norm.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_q.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.norm.query_norm.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_added_k.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.norm.key_norm.weight"
+        )
+        # ff img_mlp
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.weight"] = torch.cat(
+            [
+                original_state_dict.pop(f"double_blocks.{i}.img_mlp.up_proj.weight"),
+                original_state_dict.pop(f"double_blocks.{i}.img_mlp.gate_proj.weight"),
+            ],
+            dim=0,
+        )
+        converted_state_dict[f"{block_prefix}ff.net.0.proj.bias"] = torch.cat(
+            [
+                original_state_dict.pop(f"double_blocks.{i}.img_mlp.up_proj.bias"),
+                original_state_dict.pop(f"double_blocks.{i}.img_mlp.gate_proj.bias"),
+            ],
+            dim=0,
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mlp.down_proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff.net.2.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_mlp.down_proj.bias"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.weight"] = torch.cat(
+            [
+                original_state_dict.pop(f"double_blocks.{i}.txt_mlp.up_proj.weight"),
+                original_state_dict.pop(f"double_blocks.{i}.txt_mlp.gate_proj.weight"),
+            ],
+            dim=0,
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.0.proj.bias"] = torch.cat(
+            [
+                original_state_dict.pop(f"double_blocks.{i}.txt_mlp.up_proj.bias"),
+                original_state_dict.pop(f"double_blocks.{i}.txt_mlp.gate_proj.bias"),
+            ],
+            dim=0,
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mlp.down_proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}ff_context.net.2.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_mlp.down_proj.bias"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}attn.to_out.0.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_out.0.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.img_attn.proj.bias"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.weight"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.proj.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.to_add_out.bias"] = original_state_dict.pop(
+            f"double_blocks.{i}.txt_attn.proj.bias"
+        )
+
+    # single transformer blocks
+    for i in range(num_single_layers):
+        block_prefix = f"single_transformer_blocks.{i}."
+        # norm.linear  <- single_blocks.0.modulation.lin
+        converted_state_dict[f"{block_prefix}norm.linear.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.modulation.lin.weight"
+        )
+        converted_state_dict[f"{block_prefix}norm.linear.bias"] = original_state_dict.pop(
+            f"single_blocks.{i}.modulation.lin.bias"
+        )
+        # Q, K, V, mlp
+        mlp_hidden_dim = int(inner_dim * mlp_ratio)
+        split_size = (inner_dim, inner_dim, inner_dim, mlp_hidden_dim * 2)
+        q, k, v, mlp = torch.split(original_state_dict.pop(f"single_blocks.{i}.linear1.weight"), split_size, dim=0)
+        q_bias, k_bias, v_bias, mlp_bias = torch.split(
+            original_state_dict.pop(f"single_blocks.{i}.linear1.bias"), split_size, dim=0
+        )
+        converted_state_dict[f"{block_prefix}attn.to_q.weight"] = torch.cat([q])
+        converted_state_dict[f"{block_prefix}attn.to_q.bias"] = torch.cat([q_bias])
+        converted_state_dict[f"{block_prefix}attn.to_k.weight"] = torch.cat([k])
+        converted_state_dict[f"{block_prefix}attn.to_k.bias"] = torch.cat([k_bias])
+        converted_state_dict[f"{block_prefix}attn.to_v.weight"] = torch.cat([v])
+        converted_state_dict[f"{block_prefix}attn.to_v.bias"] = torch.cat([v_bias])
+        converted_state_dict[f"{block_prefix}proj_mlp.weight"] = torch.cat([mlp])
+        converted_state_dict[f"{block_prefix}proj_mlp.bias"] = torch.cat([mlp_bias])
+        # qk norm
+        converted_state_dict[f"{block_prefix}attn.norm_q.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.norm.query_norm.weight"
+        )
+        converted_state_dict[f"{block_prefix}attn.norm_k.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.norm.key_norm.weight"
+        )
+        # output projections.
+        converted_state_dict[f"{block_prefix}proj_out.weight"] = original_state_dict.pop(
+            f"single_blocks.{i}.linear2.weight"
+        )
+        converted_state_dict[f"{block_prefix}proj_out.bias"] = original_state_dict.pop(
+            f"single_blocks.{i}.linear2.bias"
+        )
+
+    converted_state_dict["proj_out.weight"] = original_state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = original_state_dict.pop("final_layer.linear.bias")
+    converted_state_dict["norm_out.linear.weight"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.weight")
+    )
+    converted_state_dict["norm_out.linear.bias"] = swap_scale_shift(
+        original_state_dict.pop("final_layer.adaLN_modulation.1.bias")
+    )
+
+    return converted_state_dict
+
+
+def main(args):
+    original_ckpt = load_original_checkpoint(args)
+
+    if args.transformer:
+        num_layers = 6
+        num_single_layers = 27
+        inner_dim = 3072
+        mlp_ratio = 4.0
+
+        converted_transformer_state_dict = convert_ovis_image_transformer_checkpoint_to_diffusers(
+            original_ckpt, num_layers, num_single_layers, inner_dim, mlp_ratio=mlp_ratio
+        )
+        transformer = OvisImageTransformer2DModel(in_channels=args.in_channels, out_channels=args.out_channels)
+        transformer.load_state_dict(converted_transformer_state_dict, strict=True)
+
+        print("Saving Ovis-Image Transformer in Diffusers format.")
+        transformer.to(dtype).save_pretrained(f"{args.output_path}/transformer")
+
+
+if __name__ == "__main__":
+    main(args)
diff --git a/scripts/convert_sana_controlnet_to_diffusers.py b/scripts/convert_sana_controlnet_to_diffusers.py
index bc1eb327880c..f7fcd7252576 100644
--- a/scripts/convert_sana_controlnet_to_diffusers.py
+++ b/scripts/convert_sana_controlnet_to_diffusers.py
@@ -10,7 +10,7 @@
 from diffusers import (
     SanaControlNetModel,
 )
-from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.models.model_loading_utils import load_model_dict_into_meta
 from diffusers.utils.import_utils import is_accelerate_available
 
 
diff --git a/scripts/convert_sana_to_diffusers.py b/scripts/convert_sana_to_diffusers.py
index 959a647e0a5e..833e64ba75e7 100644
--- a/scripts/convert_sana_to_diffusers.py
+++ b/scripts/convert_sana_to_diffusers.py
@@ -20,7 +20,7 @@
     SanaTransformer2DModel,
     SCMScheduler,
 )
-from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.models.model_loading_utils import load_model_dict_into_meta
 from diffusers.utils.import_utils import is_accelerate_available
 
 
diff --git a/scripts/convert_sana_video_to_diffusers.py b/scripts/convert_sana_video_to_diffusers.py
new file mode 100644
index 000000000000..a939a06cbd46
--- /dev/null
+++ b/scripts/convert_sana_video_to_diffusers.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python
+from __future__ import annotations
+
+import argparse
+import os
+from contextlib import nullcontext
+
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download, snapshot_download
+from termcolor import colored
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from diffusers import (
+    AutoencoderKLWan,
+    DPMSolverMultistepScheduler,
+    FlowMatchEulerDiscreteScheduler,
+    SanaVideoPipeline,
+    SanaVideoTransformer3DModel,
+    UniPCMultistepScheduler,
+)
+from diffusers.utils.import_utils import is_accelerate_available
+
+
+CTX = init_empty_weights if is_accelerate_available else nullcontext
+
+ckpt_ids = ["Efficient-Large-Model/SANA-Video_2B_480p/checkpoints/SANA_Video_2B_480p.pth"]
+# https://github.com/NVlabs/Sana/blob/main/inference_video_scripts/inference_sana_video.py
+
+
+def main(args):
+    cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")
+
+    if args.orig_ckpt_path is None or args.orig_ckpt_path in ckpt_ids:
+        ckpt_id = args.orig_ckpt_path or ckpt_ids[0]
+        snapshot_download(
+            repo_id=f"{'/'.join(ckpt_id.split('/')[:2])}",
+            cache_dir=cache_dir_path,
+            repo_type="model",
+        )
+        file_path = hf_hub_download(
+            repo_id=f"{'/'.join(ckpt_id.split('/')[:2])}",
+            filename=f"{'/'.join(ckpt_id.split('/')[2:])}",
+            cache_dir=cache_dir_path,
+            repo_type="model",
+        )
+    else:
+        file_path = args.orig_ckpt_path
+
+    print(colored(f"Loading checkpoint from {file_path}", "green", attrs=["bold"]))
+    all_state_dict = torch.load(file_path, weights_only=True)
+    state_dict = all_state_dict.pop("state_dict")
+    converted_state_dict = {}
+
+    # Patch embeddings.
+    converted_state_dict["patch_embedding.weight"] = state_dict.pop("x_embedder.proj.weight")
+    converted_state_dict["patch_embedding.bias"] = state_dict.pop("x_embedder.proj.bias")
+
+    # Caption projection.
+    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
+    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
+    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
+    converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")
+
+    converted_state_dict["time_embed.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
+        "t_embedder.mlp.0.weight"
+    )
+    converted_state_dict["time_embed.emb.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
+    converted_state_dict["time_embed.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
+        "t_embedder.mlp.2.weight"
+    )
+    converted_state_dict["time_embed.emb.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")
+
+    # Shared norm.
+    converted_state_dict["time_embed.linear.weight"] = state_dict.pop("t_block.1.weight")
+    converted_state_dict["time_embed.linear.bias"] = state_dict.pop("t_block.1.bias")
+
+    # y norm
+    converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")
+
+    # scheduler
+    flow_shift = 8.0
+    if args.task == "i2v":
+        assert args.scheduler_type == "flow-euler", "Scheduler type must be flow-euler for i2v task."
+
+    # model config
+    layer_num = 20
+    # Positional embedding interpolation scale.
+    qk_norm = True
+
+    # sample size
+    if args.video_size == 480:
+        sample_size = 30  # Wan-VAE: 8xp2 downsample factor
+        patch_size = (1, 2, 2)
+    elif args.video_size == 720:
+        sample_size = 22  # Wan-VAE: 32xp1 downsample factor
+        patch_size = (1, 1, 1)
+    else:
+        raise ValueError(f"Video size {args.video_size} is not supported.")
+
+    for depth in range(layer_num):
+        # Transformer blocks.
+        converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
+            f"blocks.{depth}.scale_shift_table"
+        )
+
+        # Linear Attention is all you need 🤘
+        # Self attention.
+        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
+        if qk_norm is not None:
+            # Add Q/K normalization for self-attention (attn1) - needed for Sana-Sprint and Sana-1.5
+            converted_state_dict[f"transformer_blocks.{depth}.attn1.norm_q.weight"] = state_dict.pop(
+                f"blocks.{depth}.attn.q_norm.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{depth}.attn1.norm_k.weight"] = state_dict.pop(
+                f"blocks.{depth}.attn.k_norm.weight"
+            )
+        # Projection.
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
+            f"blocks.{depth}.attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
+            f"blocks.{depth}.attn.proj.bias"
+        )
+
+        # Feed-forward.
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.inverted_conv.conv.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.bias"] = state_dict.pop(
+            f"blocks.{depth}.mlp.inverted_conv.conv.bias"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.depth_conv.conv.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.bias"] = state_dict.pop(
+            f"blocks.{depth}.mlp.depth_conv.conv.bias"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_point.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.point_conv.conv.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_temp.weight"] = state_dict.pop(
+            f"blocks.{depth}.mlp.t_conv.weight"
+        )
+
+        # Cross-attention.
+        q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
+        q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
+        k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
+        k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)
+
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias
+        if qk_norm is not None:
+            # Add Q/K normalization for cross-attention (attn2) - needed for Sana-Sprint and Sana-1.5
+            converted_state_dict[f"transformer_blocks.{depth}.attn2.norm_q.weight"] = state_dict.pop(
+                f"blocks.{depth}.cross_attn.q_norm.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{depth}.attn2.norm_k.weight"] = state_dict.pop(
+                f"blocks.{depth}.cross_attn.k_norm.weight"
+            )
+
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
+            f"blocks.{depth}.cross_attn.proj.weight"
+        )
+        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
+            f"blocks.{depth}.cross_attn.proj.bias"
+        )
+
+    # Final block.
+    converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
+    converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
+    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")
+
+    # Transformer
+    with CTX():
+        transformer_kwargs = {
+            "in_channels": 16,
+            "out_channels": 16,
+            "num_attention_heads": 20,
+            "attention_head_dim": 112,
+            "num_layers": 20,
+            "num_cross_attention_heads": 20,
+            "cross_attention_head_dim": 112,
+            "cross_attention_dim": 2240,
+            "caption_channels": 2304,
+            "mlp_ratio": 3.0,
+            "attention_bias": False,
+            "sample_size": sample_size,
+            "patch_size": patch_size,
+            "norm_elementwise_affine": False,
+            "norm_eps": 1e-6,
+            "qk_norm": "rms_norm_across_heads",
+            "rope_max_seq_len": 1024,
+        }
+
+        transformer = SanaVideoTransformer3DModel(**transformer_kwargs)
+
+    transformer.load_state_dict(converted_state_dict, strict=True, assign=True)
+
+    try:
+        state_dict.pop("y_embedder.y_embedding")
+        state_dict.pop("pos_embed")
+        state_dict.pop("logvar_linear.weight")
+        state_dict.pop("logvar_linear.bias")
+    except KeyError:
+        print("y_embedder.y_embedding or pos_embed not found in the state_dict")
+
+    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"
+
+    num_model_params = sum(p.numel() for p in transformer.parameters())
+    print(f"Total number of transformer parameters: {num_model_params}")
+
+    transformer = transformer.to(weight_dtype)
+
+    if not args.save_full_pipeline:
+        print(
+            colored(
+                f"Only saving transformer model of {args.model_type}. "
+                f"Set --save_full_pipeline to save the whole Pipeline",
+                "green",
+                attrs=["bold"],
+            )
+        )
+        transformer.save_pretrained(
+            os.path.join(args.dump_path, "transformer"), safe_serialization=True, max_shard_size="5GB"
+        )
+    else:
+        print(colored(f"Saving the whole Pipeline containing {args.model_type}", "green", attrs=["bold"]))
+        # VAE
+        vae = AutoencoderKLWan.from_pretrained(
+            "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32
+        )
+
+        # Text Encoder
+        text_encoder_model_path = "Efficient-Large-Model/gemma-2-2b-it"
+        tokenizer = AutoTokenizer.from_pretrained(text_encoder_model_path)
+        tokenizer.padding_side = "right"
+        text_encoder = AutoModelForCausalLM.from_pretrained(
+            text_encoder_model_path, torch_dtype=torch.bfloat16
+        ).get_decoder()
+
+        # Choose the appropriate pipeline and scheduler based on model type
+        # Original Sana scheduler
+        if args.scheduler_type == "flow-dpm_solver":
+            scheduler = DPMSolverMultistepScheduler(
+                flow_shift=flow_shift,
+                use_flow_sigmas=True,
+                prediction_type="flow_prediction",
+            )
+        elif args.scheduler_type == "flow-euler":
+            scheduler = FlowMatchEulerDiscreteScheduler(shift=flow_shift)
+        elif args.scheduler_type == "uni-pc":
+            scheduler = UniPCMultistepScheduler(
+                prediction_type="flow_prediction",
+                use_flow_sigmas=True,
+                num_train_timesteps=1000,
+                flow_shift=flow_shift,
+            )
+        else:
+            raise ValueError(f"Scheduler type {args.scheduler_type} is not supported")
+
+        pipe = SanaVideoPipeline(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            vae=vae,
+            scheduler=scheduler,
+        )
+
+        pipe.save_pretrained(args.dump_path, safe_serialization=True, max_shard_size="5GB")
+
+
+DTYPE_MAPPING = {
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+}
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--video_size",
+        default=480,
+        type=int,
+        choices=[480, 720],
+        required=False,
+        help="Video size of pretrained model, 480 or 720.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default="SanaVideo",
+        type=str,
+        choices=[
+            "SanaVideo",
+        ],
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="flow-dpm_solver",
+        type=str,
+        choices=["flow-dpm_solver", "flow-euler", "uni-pc"],
+        help="Scheduler type to use.",
+    )
+    parser.add_argument("--task", default="t2v", type=str, required=True, help="Task to convert, t2v or i2v.")
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
+    parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipeline elements in one.")
+    parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.")
+
+    args = parser.parse_args()
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    weight_dtype = DTYPE_MAPPING[args.dtype]
+
+    main(args)
diff --git a/scripts/convert_sd3_to_diffusers.py b/scripts/convert_sd3_to_diffusers.py
index 0a3569efeab0..83cb436e6e32 100644
--- a/scripts/convert_sd3_to_diffusers.py
+++ b/scripts/convert_sd3_to_diffusers.py
@@ -7,7 +7,7 @@
 
 from diffusers import AutoencoderKL, SD3Transformer2DModel
 from diffusers.loaders.single_file_utils import convert_ldm_vae_checkpoint
-from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.models.model_loading_utils import load_model_dict_into_meta
 from diffusers.utils.import_utils import is_accelerate_available
 
 
diff --git a/scripts/convert_stable_audio.py b/scripts/convert_stable_audio.py
index 757d47a3168b..c3479fc6b2bb 100644
--- a/scripts/convert_stable_audio.py
+++ b/scripts/convert_stable_audio.py
@@ -18,7 +18,7 @@
     StableAudioPipeline,
     StableAudioProjectionModel,
 )
-from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.models.model_loading_utils import load_model_dict_into_meta
 from diffusers.utils import is_accelerate_available
 
 
diff --git a/scripts/convert_stable_cascade.py b/scripts/convert_stable_cascade.py
index ce10970b0b6a..97ed18d9b4d4 100644
--- a/scripts/convert_stable_cascade.py
+++ b/scripts/convert_stable_cascade.py
@@ -20,7 +20,7 @@
 )
 from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
 from diffusers.models import StableCascadeUNet
-from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.models.model_loading_utils import load_model_dict_into_meta
 from diffusers.pipelines.wuerstchen import PaellaVQModel
 from diffusers.utils import is_accelerate_available
 
diff --git a/scripts/convert_stable_cascade_lite.py b/scripts/convert_stable_cascade_lite.py
index ddccaa3b2e8a..5f4804e30f74 100644
--- a/scripts/convert_stable_cascade_lite.py
+++ b/scripts/convert_stable_cascade_lite.py
@@ -20,7 +20,7 @@
 )
 from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
 from diffusers.models import StableCascadeUNet
-from diffusers.models.modeling_utils import load_model_dict_into_meta
+from diffusers.models.model_loading_utils import load_model_dict_into_meta
 from diffusers.pipelines.wuerstchen import PaellaVQModel
 from diffusers.utils import is_accelerate_available
 
diff --git a/scripts/convert_wan_to_diffusers.py b/scripts/convert_wan_to_diffusers.py
index 39a364b07d78..06f87409262a 100644
--- a/scripts/convert_wan_to_diffusers.py
+++ b/scripts/convert_wan_to_diffusers.py
@@ -6,11 +6,20 @@
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download, snapshot_download
 from safetensors.torch import load_file
-from transformers import AutoProcessor, AutoTokenizer, CLIPVisionModelWithProjection, UMT5EncoderModel
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    CLIPImageProcessor,
+    CLIPVisionModel,
+    CLIPVisionModelWithProjection,
+    UMT5EncoderModel,
+)
 
 from diffusers import (
     AutoencoderKLWan,
     UniPCMultistepScheduler,
+    WanAnimatePipeline,
+    WanAnimateTransformer3DModel,
     WanImageToVideoPipeline,
     WanPipeline,
     WanTransformer3DModel,
@@ -105,8 +114,203 @@
     "after_proj": "proj_out",
 }
 
+ANIMATE_TRANSFORMER_KEYS_RENAME_DICT = {
+    "time_embedding.0": "condition_embedder.time_embedder.linear_1",
+    "time_embedding.2": "condition_embedder.time_embedder.linear_2",
+    "text_embedding.0": "condition_embedder.text_embedder.linear_1",
+    "text_embedding.2": "condition_embedder.text_embedder.linear_2",
+    "time_projection.1": "condition_embedder.time_proj",
+    "head.modulation": "scale_shift_table",
+    "head.head": "proj_out",
+    "modulation": "scale_shift_table",
+    "ffn.0": "ffn.net.0.proj",
+    "ffn.2": "ffn.net.2",
+    # Hack to swap the layer names
+    # The original model calls the norms in following order: norm1, norm3, norm2
+    # We convert it to: norm1, norm2, norm3
+    "norm2": "norm__placeholder",
+    "norm3": "norm2",
+    "norm__placeholder": "norm3",
+    "img_emb.proj.0": "condition_embedder.image_embedder.norm1",
+    "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
+    "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
+    "img_emb.proj.4": "condition_embedder.image_embedder.norm2",
+    # Add attention component mappings
+    "self_attn.q": "attn1.to_q",
+    "self_attn.k": "attn1.to_k",
+    "self_attn.v": "attn1.to_v",
+    "self_attn.o": "attn1.to_out.0",
+    "self_attn.norm_q": "attn1.norm_q",
+    "self_attn.norm_k": "attn1.norm_k",
+    "cross_attn.q": "attn2.to_q",
+    "cross_attn.k": "attn2.to_k",
+    "cross_attn.v": "attn2.to_v",
+    "cross_attn.o": "attn2.to_out.0",
+    "cross_attn.norm_q": "attn2.norm_q",
+    "cross_attn.norm_k": "attn2.norm_k",
+    "cross_attn.k_img": "attn2.to_k_img",
+    "cross_attn.v_img": "attn2.to_v_img",
+    "cross_attn.norm_k_img": "attn2.norm_k_img",
+    # After cross_attn -> attn2 rename, we need to rename the img keys
+    "attn2.to_k_img": "attn2.add_k_proj",
+    "attn2.to_v_img": "attn2.add_v_proj",
+    "attn2.norm_k_img": "attn2.norm_added_k",
+    # Wan Animate-specific mappings (motion encoder, face encoder, face adapter)
+    # Motion encoder mappings
+    # The name mapping is complicated for the convolutional part so we handle that in its own function
+    "motion_encoder.enc.fc": "motion_encoder.motion_network",
+    "motion_encoder.dec.direction.weight": "motion_encoder.motion_synthesis_weight",
+    # Face encoder mappings - CausalConv1d has a .conv submodule that we need to flatten
+    "face_encoder.conv1_local.conv": "face_encoder.conv1_local",
+    "face_encoder.conv2.conv": "face_encoder.conv2",
+    "face_encoder.conv3.conv": "face_encoder.conv3",
+    # Face adapter mappings are handled in a separate function
+}
+
+
+# TODO: Verify this and simplify if possible.
+def convert_animate_motion_encoder_weights(key: str, state_dict: Dict[str, Any], final_conv_idx: int = 8) -> None:
+    """
+    Convert all motion encoder weights for Animate model.
+
+    In the original model:
+    - All Linear layers in fc use EqualLinear
+    - All Conv2d layers in convs use EqualConv2d (except blur_conv which is initialized separately)
+    - Blur kernels are stored as buffers in Sequential modules
+    - ConvLayer is nn.Sequential with indices: [Blur (optional), EqualConv2d, FusedLeakyReLU (optional)]
+
+    Conversion strategy:
+    1. Drop .kernel buffers (blur kernels)
+    2. Rename sequential indices to named components (e.g., 0 -> conv2d, 1 -> bias_leaky_relu)
+    """
+    # Skip if not a weight, bias, or kernel
+    if ".weight" not in key and ".bias" not in key and ".kernel" not in key:
+        return
+
+    # Handle Blur kernel buffers from original implementation.
+    # After renaming, these appear under: motion_encoder.res_blocks.*.conv{2,skip}.blur_kernel
+    # Diffusers constructs blur kernels as a non-persistent buffer so we must drop these keys
+    if ".kernel" in key and "motion_encoder" in key:
+        # Remove unexpected blur kernel buffers to avoid strict load errors
+        state_dict.pop(key, None)
+        return
+
+    # Rename Sequential indices to named components in ConvLayer and ResBlock
+    if ".enc.net_app.convs." in key and (".weight" in key or ".bias" in key):
+        parts = key.split(".")
+
+        # Find the sequential index (digit) after convs or after conv1/conv2/skip
+        # Examples:
+        # - enc.net_app.convs.0.0.weight -> conv_in.weight (initial conv layer weight)
+        # - enc.net_app.convs.0.1.bias -> conv_in.act_fn.bias (initial conv layer bias)
+        # - enc.net_app.convs.{n:1-7}.conv1.0.weight -> res_blocks.{(n-1):0-6}.conv1.weight (conv1 weight)
+        #     - e.g. enc.net_app.convs.1.conv1.0.weight -> res_blocks.0.conv1.weight
+        # - enc.net_app.convs.{n:1-7}.conv1.1.bias -> res_blocks.{(n-1):0-6}.conv1.act_fn.bias (conv1 bias)
+        #     - e.g. enc.net_app.convs.1.conv1.1.bias -> res_blocks.0.conv1.act_fn.bias
+        # - enc.net_app.convs.{n:1-7}.conv2.1.weight -> res_blocks.{(n-1):0-6}.conv2.weight (conv2 weight)
+        # - enc.net_app.convs.1.conv2.2.bias -> res_blocks.0.conv2.act_fn.bias (conv2 bias)
+        # - enc.net_app.convs.{n:1-7}.skip.1.weight -> res_blocks.{(n-1):0-6}.conv_skip.weight (skip conv weight)
+        # - enc.net_app.convs.8 -> conv_out (final conv layer)
+
+        convs_idx = parts.index("convs") if "convs" in parts else -1
+        if convs_idx >= 0 and len(parts) - convs_idx >= 2:
+            bias = False
+            # The nn.Sequential index will always follow convs
+            sequential_idx = int(parts[convs_idx + 1])
+            if sequential_idx == 0:
+                if key.endswith(".weight"):
+                    new_key = "motion_encoder.conv_in.weight"
+                elif key.endswith(".bias"):
+                    new_key = "motion_encoder.conv_in.act_fn.bias"
+                    bias = True
+            elif sequential_idx == final_conv_idx:
+                if key.endswith(".weight"):
+                    new_key = "motion_encoder.conv_out.weight"
+            else:
+                # Intermediate .convs. layers, which get mapped to .res_blocks.
+                prefix = "motion_encoder.res_blocks."
+
+                layer_name = parts[convs_idx + 2]
+                if layer_name == "skip":
+                    layer_name = "conv_skip"
+
+                if key.endswith(".weight"):
+                    param_name = "weight"
+                elif key.endswith(".bias"):
+                    param_name = "act_fn.bias"
+                    bias = True
+
+                suffix_parts = [str(sequential_idx - 1), layer_name, param_name]
+                suffix = ".".join(suffix_parts)
+                new_key = prefix + suffix
+
+            param = state_dict.pop(key)
+            if bias:
+                param = param.squeeze()
+            state_dict[new_key] = param
+            return
+        return
+    return
+
+
+def convert_animate_face_adapter_weights(key: str, state_dict: Dict[str, Any]) -> None:
+    """
+    Convert face adapter weights for the Animate model.
+
+    The original model uses a fused KV projection but the diffusers models uses separate K and V projections.
+    """
+    # Skip if not a weight or bias
+    if ".weight" not in key and ".bias" not in key:
+        return
+
+    prefix = "face_adapter."
+    if ".fuser_blocks." in key:
+        parts = key.split(".")
+
+        module_list_idx = parts.index("fuser_blocks") if "fuser_blocks" in parts else -1
+        if module_list_idx >= 0 and (len(parts) - 1) - module_list_idx == 3:
+            block_idx = parts[module_list_idx + 1]
+            layer_name = parts[module_list_idx + 2]
+            param_name = parts[module_list_idx + 3]
+
+            if layer_name == "linear1_kv":
+                layer_name_k = "to_k"
+                layer_name_v = "to_v"
+
+                suffix_k = ".".join([block_idx, layer_name_k, param_name])
+                suffix_v = ".".join([block_idx, layer_name_v, param_name])
+                new_key_k = prefix + suffix_k
+                new_key_v = prefix + suffix_v
+
+                kv_proj = state_dict.pop(key)
+                k_proj, v_proj = torch.chunk(kv_proj, 2, dim=0)
+                state_dict[new_key_k] = k_proj
+                state_dict[new_key_v] = v_proj
+                return
+            else:
+                if layer_name == "q_norm":
+                    new_layer_name = "norm_q"
+                elif layer_name == "k_norm":
+                    new_layer_name = "norm_k"
+                elif layer_name == "linear1_q":
+                    new_layer_name = "to_q"
+                elif layer_name == "linear2":
+                    new_layer_name = "to_out"
+
+                suffix_parts = [block_idx, new_layer_name, param_name]
+                suffix = ".".join(suffix_parts)
+                new_key = prefix + suffix
+                state_dict[new_key] = state_dict.pop(key)
+                return
+    return
+
+
 TRANSFORMER_SPECIAL_KEYS_REMAP = {}
 VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
+ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP = {
+    "motion_encoder": convert_animate_motion_encoder_weights,
+    "face_adapter": convert_animate_face_adapter_weights,
+}
 
 
 def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
@@ -364,6 +568,37 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
         }
         RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
         SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
+    elif model_type == "Wan2.2-Animate-14B":
+        config = {
+            "model_id": "Wan-AI/Wan2.2-Animate-14B",
+            "diffusers_config": {
+                "image_dim": 1280,
+                "added_kv_proj_dim": 5120,
+                "attention_head_dim": 128,
+                "cross_attn_norm": True,
+                "eps": 1e-06,
+                "ffn_dim": 13824,
+                "freq_dim": 256,
+                "in_channels": 36,
+                "num_attention_heads": 40,
+                "num_layers": 40,
+                "out_channels": 16,
+                "patch_size": (1, 2, 2),
+                "qk_norm": "rms_norm_across_heads",
+                "text_dim": 4096,
+                "rope_max_seq_len": 1024,
+                "pos_embed_seq_len": None,
+                "motion_encoder_size": 512,  # Start of Wan Animate-specific configs
+                "motion_style_dim": 512,
+                "motion_dim": 20,
+                "motion_encoder_dim": 512,
+                "face_encoder_hidden_dim": 1024,
+                "face_encoder_num_heads": 4,
+                "inject_face_latents_blocks": 5,
+            },
+        }
+        RENAME_DICT = ANIMATE_TRANSFORMER_KEYS_RENAME_DICT
+        SPECIAL_KEYS_REMAP = ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP
     return config, RENAME_DICT, SPECIAL_KEYS_REMAP
 
 
@@ -380,10 +615,12 @@ def convert_transformer(model_type: str, stage: str = None):
     original_state_dict = load_sharded_safetensors(model_dir)
 
     with init_empty_weights():
-        if "VACE" not in model_type:
-            transformer = WanTransformer3DModel.from_config(diffusers_config)
-        else:
+        if "Animate" in model_type:
+            transformer = WanAnimateTransformer3DModel.from_config(diffusers_config)
+        elif "VACE" in model_type:
             transformer = WanVACETransformer3DModel.from_config(diffusers_config)
+        else:
+            transformer = WanTransformer3DModel.from_config(diffusers_config)
 
     for key in list(original_state_dict.keys()):
         new_key = key[:]
@@ -397,7 +634,12 @@ def convert_transformer(model_type: str, stage: str = None):
                 continue
             handler_fn_inplace(key, original_state_dict)
 
+    # Load state dict into the meta model, which will materialize the tensors
     transformer.load_state_dict(original_state_dict, strict=True, assign=True)
+
+    # Move to CPU to ensure all tensors are materialized
+    transformer = transformer.to("cpu")
+
     return transformer
 
 
@@ -926,7 +1168,7 @@ def get_args():
 if __name__ == "__main__":
     args = get_args()
 
-    if "Wan2.2" in args.model_type and "TI2V" not in args.model_type:
+    if "Wan2.2" in args.model_type and "TI2V" not in args.model_type and "Animate" not in args.model_type:
         transformer = convert_transformer(args.model_type, stage="high_noise_model")
         transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
     else:
@@ -942,7 +1184,7 @@ def get_args():
     tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
     if "FLF2V" in args.model_type:
         flow_shift = 16.0
-    elif "TI2V" in args.model_type:
+    elif "TI2V" in args.model_type or "Animate" in args.model_type:
         flow_shift = 5.0
     else:
         flow_shift = 3.0
@@ -954,6 +1196,8 @@ def get_args():
     if args.dtype != "none":
         dtype = DTYPE_MAPPING[args.dtype]
         transformer.to(dtype)
+        if transformer_2 is not None:
+            transformer_2.to(dtype)
 
     if "Wan2.2" and "I2V" in args.model_type and "TI2V" not in args.model_type:
         pipe = WanImageToVideoPipeline(
@@ -1016,6 +1260,21 @@ def get_args():
             vae=vae,
             scheduler=scheduler,
         )
+    elif "Animate" in args.model_type:
+        image_encoder = CLIPVisionModel.from_pretrained(
+            "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
+        )
+        image_processor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+
+        pipe = WanAnimatePipeline(
+            transformer=transformer,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            vae=vae,
+            scheduler=scheduler,
+            image_encoder=image_encoder,
+            image_processor=image_processor,
+        )
     else:
         pipe = WanPipeline(
             transformer=transformer,
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 94104667b541..6df4ad489415 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -169,10 +169,12 @@
             "LayerSkipConfig",
             "PyramidAttentionBroadcastConfig",
             "SmoothedEnergyGuidanceConfig",
+            "TaylorSeerCacheConfig",
             "apply_faster_cache",
             "apply_first_block_cache",
             "apply_layer_skip",
             "apply_pyramid_attention_broadcast",
+            "apply_taylorseer_cache",
         ]
     )
     _import_structure["models"].extend(
@@ -186,9 +188,11 @@
             "AutoencoderKLAllegro",
             "AutoencoderKLCogVideoX",
             "AutoencoderKLCosmos",
+            "AutoencoderKLFlux2",
             "AutoencoderKLHunyuanImage",
             "AutoencoderKLHunyuanImageRefiner",
             "AutoencoderKLHunyuanVideo",
+            "AutoencoderKLHunyuanVideo15",
             "AutoencoderKLLTXVideo",
             "AutoencoderKLMagvit",
             "AutoencoderKLMochi",
@@ -202,6 +206,7 @@
             "BriaTransformer2DModel",
             "CacheMixin",
             "ChromaTransformer2DModel",
+            "ChronoEditTransformer3DModel",
             "CogVideoXTransformer3DModel",
             "CogView3PlusTransformer2DModel",
             "CogView4Transformer2DModel",
@@ -214,6 +219,7 @@
             "CosmosTransformer3DModel",
             "DiTTransformer2DModel",
             "EasyAnimateTransformer3DModel",
+            "Flux2Transformer2DModel",
             "FluxControlNetModel",
             "FluxMultiControlNetModel",
             "FluxTransformer2DModel",
@@ -222,6 +228,7 @@
             "HunyuanDiT2DModel",
             "HunyuanDiT2DMultiControlNetModel",
             "HunyuanImageTransformer2DModel",
+            "HunyuanVideo15Transformer3DModel",
             "HunyuanVideoFramepackTransformer3DModel",
             "HunyuanVideoTransformer3DModel",
             "I2VGenXLUNet",
@@ -237,6 +244,7 @@
             "MultiAdapter",
             "MultiControlNetModel",
             "OmniGenTransformer2DModel",
+            "OvisImageTransformer2DModel",
             "ParallelConfig",
             "PixArtTransformer2DModel",
             "PriorTransformer",
@@ -246,6 +254,7 @@
             "QwenImageTransformer2DModel",
             "SanaControlNetModel",
             "SanaTransformer2DModel",
+            "SanaVideoTransformer3DModel",
             "SD3ControlNetModel",
             "SD3MultiControlNetModel",
             "SD3Transformer2DModel",
@@ -266,8 +275,10 @@
             "UNetSpatioTemporalConditionModel",
             "UVit2DModel",
             "VQModel",
+            "WanAnimateTransformer3DModel",
             "WanTransformer3DModel",
             "WanVACETransformer3DModel",
+            "ZImageTransformer2DModel",
             "attention_backend",
         ]
     )
@@ -405,6 +416,7 @@
             "QwenImageModularPipeline",
             "StableDiffusionXLAutoBlocks",
             "StableDiffusionXLModularPipeline",
+            "Wan22AutoBlocks",
             "WanAutoBlocks",
             "WanModularPipeline",
         ]
@@ -435,6 +447,7 @@
             "BriaPipeline",
             "ChromaImg2ImgPipeline",
             "ChromaPipeline",
+            "ChronoEditPipeline",
             "CLIPImageProjection",
             "CogVideoXFunControlPipeline",
             "CogVideoXImageToVideoPipeline",
@@ -452,6 +465,7 @@
             "EasyAnimateControlPipeline",
             "EasyAnimateInpaintPipeline",
             "EasyAnimatePipeline",
+            "Flux2Pipeline",
             "FluxControlImg2ImgPipeline",
             "FluxControlInpaintPipeline",
             "FluxControlNetImg2ImgPipeline",
@@ -472,6 +486,8 @@
             "HunyuanImagePipeline",
             "HunyuanImageRefinerPipeline",
             "HunyuanSkyreelsImageToVideoPipeline",
+            "HunyuanVideo15ImageToVideoPipeline",
+            "HunyuanVideo15Pipeline",
             "HunyuanVideoFramepackPipeline",
             "HunyuanVideoImageToVideoPipeline",
             "HunyuanVideoPipeline",
@@ -485,6 +501,9 @@
             "ImageTextPipelineOutput",
             "Kandinsky3Img2ImgPipeline",
             "Kandinsky3Pipeline",
+            "Kandinsky5I2IPipeline",
+            "Kandinsky5I2VPipeline",
+            "Kandinsky5T2IPipeline",
             "Kandinsky5T2VPipeline",
             "KandinskyCombinedPipeline",
             "KandinskyImg2ImgCombinedPipeline",
@@ -524,6 +543,7 @@
             "MochiPipeline",
             "MusicLDMPipeline",
             "OmniGenPipeline",
+            "OvisImagePipeline",
             "PaintByExamplePipeline",
             "PIAPipeline",
             "PixArtAlphaPipeline",
@@ -540,10 +560,13 @@
             "QwenImagePipeline",
             "ReduxImageEncoder",
             "SanaControlNetPipeline",
+            "SanaImageToVideoPipeline",
             "SanaPAGPipeline",
             "SanaPipeline",
             "SanaSprintImg2ImgPipeline",
             "SanaSprintPipeline",
+            "SanaVideoPipeline",
+            "SanaVideoPipeline",
             "SemanticStableDiffusionPipeline",
             "ShapEImg2ImgPipeline",
             "ShapEPipeline",
@@ -631,6 +654,7 @@
             "VisualClozeGenerationPipeline",
             "VisualClozePipeline",
             "VQDiffusionPipeline",
+            "WanAnimatePipeline",
             "WanImageToVideoPipeline",
             "WanPipeline",
             "WanVACEPipeline",
@@ -638,6 +662,8 @@
             "WuerstchenCombinedPipeline",
             "WuerstchenDecoderPipeline",
             "WuerstchenPriorPipeline",
+            "ZImageImg2ImgPipeline",
+            "ZImagePipeline",
         ]
     )
 
@@ -876,10 +902,12 @@
             LayerSkipConfig,
             PyramidAttentionBroadcastConfig,
             SmoothedEnergyGuidanceConfig,
+            TaylorSeerCacheConfig,
             apply_faster_cache,
             apply_first_block_cache,
             apply_layer_skip,
             apply_pyramid_attention_broadcast,
+            apply_taylorseer_cache,
         )
         from .models import (
             AllegroTransformer3DModel,
@@ -891,9 +919,11 @@
             AutoencoderKLAllegro,
             AutoencoderKLCogVideoX,
             AutoencoderKLCosmos,
+            AutoencoderKLFlux2,
             AutoencoderKLHunyuanImage,
             AutoencoderKLHunyuanImageRefiner,
             AutoencoderKLHunyuanVideo,
+            AutoencoderKLHunyuanVideo15,
             AutoencoderKLLTXVideo,
             AutoencoderKLMagvit,
             AutoencoderKLMochi,
@@ -907,6 +937,7 @@
             BriaTransformer2DModel,
             CacheMixin,
             ChromaTransformer2DModel,
+            ChronoEditTransformer3DModel,
             CogVideoXTransformer3DModel,
             CogView3PlusTransformer2DModel,
             CogView4Transformer2DModel,
@@ -919,6 +950,7 @@
             CosmosTransformer3DModel,
             DiTTransformer2DModel,
             EasyAnimateTransformer3DModel,
+            Flux2Transformer2DModel,
             FluxControlNetModel,
             FluxMultiControlNetModel,
             FluxTransformer2DModel,
@@ -927,6 +959,7 @@
             HunyuanDiT2DModel,
             HunyuanDiT2DMultiControlNetModel,
             HunyuanImageTransformer2DModel,
+            HunyuanVideo15Transformer3DModel,
             HunyuanVideoFramepackTransformer3DModel,
             HunyuanVideoTransformer3DModel,
             I2VGenXLUNet,
@@ -942,6 +975,7 @@
             MultiAdapter,
             MultiControlNetModel,
             OmniGenTransformer2DModel,
+            OvisImageTransformer2DModel,
             ParallelConfig,
             PixArtTransformer2DModel,
             PriorTransformer,
@@ -951,6 +985,7 @@
             QwenImageTransformer2DModel,
             SanaControlNetModel,
             SanaTransformer2DModel,
+            SanaVideoTransformer3DModel,
             SD3ControlNetModel,
             SD3MultiControlNetModel,
             SD3Transformer2DModel,
@@ -970,8 +1005,10 @@
             UNetSpatioTemporalConditionModel,
             UVit2DModel,
             VQModel,
+            WanAnimateTransformer3DModel,
             WanTransformer3DModel,
             WanVACETransformer3DModel,
+            ZImageTransformer2DModel,
             attention_backend,
         )
         from .modular_pipelines import ComponentsManager, ComponentSpec, ModularPipeline, ModularPipelineBlocks
@@ -1084,6 +1121,7 @@
             QwenImageModularPipeline,
             StableDiffusionXLAutoBlocks,
             StableDiffusionXLModularPipeline,
+            Wan22AutoBlocks,
             WanAutoBlocks,
             WanModularPipeline,
         )
@@ -1110,6 +1148,7 @@
             BriaPipeline,
             ChromaImg2ImgPipeline,
             ChromaPipeline,
+            ChronoEditPipeline,
             CLIPImageProjection,
             CogVideoXFunControlPipeline,
             CogVideoXImageToVideoPipeline,
@@ -1127,6 +1166,7 @@
             EasyAnimateControlPipeline,
             EasyAnimateInpaintPipeline,
             EasyAnimatePipeline,
+            Flux2Pipeline,
             FluxControlImg2ImgPipeline,
             FluxControlInpaintPipeline,
             FluxControlNetImg2ImgPipeline,
@@ -1147,6 +1187,8 @@
             HunyuanImagePipeline,
             HunyuanImageRefinerPipeline,
             HunyuanSkyreelsImageToVideoPipeline,
+            HunyuanVideo15ImageToVideoPipeline,
+            HunyuanVideo15Pipeline,
             HunyuanVideoFramepackPipeline,
             HunyuanVideoImageToVideoPipeline,
             HunyuanVideoPipeline,
@@ -1160,6 +1202,9 @@
             ImageTextPipelineOutput,
             Kandinsky3Img2ImgPipeline,
             Kandinsky3Pipeline,
+            Kandinsky5I2IPipeline,
+            Kandinsky5I2VPipeline,
+            Kandinsky5T2IPipeline,
             Kandinsky5T2VPipeline,
             KandinskyCombinedPipeline,
             KandinskyImg2ImgCombinedPipeline,
@@ -1199,6 +1244,7 @@
             MochiPipeline,
             MusicLDMPipeline,
             OmniGenPipeline,
+            OvisImagePipeline,
             PaintByExamplePipeline,
             PIAPipeline,
             PixArtAlphaPipeline,
@@ -1215,10 +1261,12 @@
             QwenImagePipeline,
             ReduxImageEncoder,
             SanaControlNetPipeline,
+            SanaImageToVideoPipeline,
             SanaPAGPipeline,
             SanaPipeline,
             SanaSprintImg2ImgPipeline,
             SanaSprintPipeline,
+            SanaVideoPipeline,
             SemanticStableDiffusionPipeline,
             ShapEImg2ImgPipeline,
             ShapEPipeline,
@@ -1305,6 +1353,7 @@
             VisualClozeGenerationPipeline,
             VisualClozePipeline,
             VQDiffusionPipeline,
+            WanAnimatePipeline,
             WanImageToVideoPipeline,
             WanPipeline,
             WanVACEPipeline,
@@ -1312,6 +1361,8 @@
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
+            ZImageImg2ImgPipeline,
+            ZImagePipeline,
         )
 
     try:
diff --git a/src/diffusers/guiders/adaptive_projected_guidance.py b/src/diffusers/guiders/adaptive_projected_guidance.py
index 492d10d2f108..8ec30d02d758 100644
--- a/src/diffusers/guiders/adaptive_projected_guidance.py
+++ b/src/diffusers/guiders/adaptive_projected_guidance.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -88,6 +88,19 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        if self._step == 0:
+            if self.adaptive_projected_guidance_momentum is not None:
+                self.momentum_buffer = MomentumBuffer(self.adaptive_projected_guidance_momentum)
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/guiders/adaptive_projected_guidance_mix.py b/src/diffusers/guiders/adaptive_projected_guidance_mix.py
index 732741fc927f..bdc97bcf6269 100644
--- a/src/diffusers/guiders/adaptive_projected_guidance_mix.py
+++ b/src/diffusers/guiders/adaptive_projected_guidance_mix.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -99,6 +99,19 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        if self._step == 0:
+            if self.adaptive_projected_guidance_momentum is not None:
+                self.momentum_buffer = MomentumBuffer(self.adaptive_projected_guidance_momentum)
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/guiders/auto_guidance.py b/src/diffusers/guiders/auto_guidance.py
index 4374f45aff7c..b7f62e2f4a6e 100644
--- a/src/diffusers/guiders/auto_guidance.py
+++ b/src/diffusers/guiders/auto_guidance.py
@@ -141,6 +141,16 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/guiders/classifier_free_guidance.py b/src/diffusers/guiders/classifier_free_guidance.py
index d475b302263d..5e55d4d869c1 100644
--- a/src/diffusers/guiders/classifier_free_guidance.py
+++ b/src/diffusers/guiders/classifier_free_guidance.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -99,6 +99,16 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/guiders/classifier_free_zero_star_guidance.py b/src/diffusers/guiders/classifier_free_zero_star_guidance.py
index 1ea6bbb1c830..23b492e51b02 100644
--- a/src/diffusers/guiders/classifier_free_zero_star_guidance.py
+++ b/src/diffusers/guiders/classifier_free_zero_star_guidance.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -85,6 +85,16 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/guiders/frequency_decoupled_guidance.py b/src/diffusers/guiders/frequency_decoupled_guidance.py
index cd542a43a429..4ec6e2d36da9 100644
--- a/src/diffusers/guiders/frequency_decoupled_guidance.py
+++ b/src/diffusers/guiders/frequency_decoupled_guidance.py
@@ -226,6 +226,16 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/guiders/guider_utils.py b/src/diffusers/guiders/guider_utils.py
index 71e4becfcdf3..6c328328fc3b 100644
--- a/src/diffusers/guiders/guider_utils.py
+++ b/src/diffusers/guiders/guider_utils.py
@@ -166,6 +166,11 @@ def cleanup_models(self, denoiser: torch.nn.Module) -> None:
     def prepare_inputs(self, data: "BlockState") -> List["BlockState"]:
         raise NotImplementedError("BaseGuidance::prepare_inputs must be implemented in subclasses.")
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        raise NotImplementedError("BaseGuidance::prepare_inputs_from_block_state must be implemented in subclasses.")
+
     def __call__(self, data: List["BlockState"]) -> Any:
         if not all(hasattr(d, "noise_pred") for d in data):
             raise ValueError("Expected all data to have `noise_pred` attribute.")
@@ -234,6 +239,51 @@ def _prepare_batch(
         data_batch[cls._identifier_key] = identifier
         return BlockState(**data_batch)
 
+    @classmethod
+    def _prepare_batch_from_block_state(
+        cls,
+        input_fields: Dict[str, Union[str, Tuple[str, str]]],
+        data: "BlockState",
+        tuple_index: int,
+        identifier: str,
+    ) -> "BlockState":
+        """
+        Prepares a batch of data for the guidance technique. This method is used in the `prepare_inputs` method of the
+        `BaseGuidance` class. It prepares the batch based on the provided tuple index.
+
+        Args:
+            input_fields (`Dict[str, Union[str, Tuple[str, str]]]`):
+                A dictionary where the keys are the names of the fields that will be used to store the data once it is
+                prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2, which is used
+                to look up the required data provided for preparation. If a string is provided, it will be used as the
+                conditional data (or unconditional if used with a guidance method that requires it). If a tuple of
+                length 2 is provided, the first element must be the conditional data identifier and the second element
+                must be the unconditional data identifier or None.
+            data (`BlockState`):
+                The input data to be prepared.
+            tuple_index (`int`):
+                The index to use when accessing input fields that are tuples.
+
+        Returns:
+            `BlockState`: The prepared batch of data.
+        """
+        from ..modular_pipelines.modular_pipeline import BlockState
+
+        data_batch = {}
+        for key, value in input_fields.items():
+            try:
+                if isinstance(value, str):
+                    data_batch[key] = getattr(data, value)
+                elif isinstance(value, tuple):
+                    data_batch[key] = getattr(data, value[tuple_index])
+                else:
+                    # We've already checked that value is a string or a tuple of strings with length 2
+                    pass
+            except AttributeError:
+                logger.debug(f"`data` does not have attribute(s) {value}, skipping.")
+        data_batch[cls._identifier_key] = identifier
+        return BlockState(**data_batch)
+
     @classmethod
     @validate_hf_hub_args
     def from_pretrained(
@@ -323,7 +373,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
     r"""
     Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
     Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
-    Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+    Flawed](https://huggingface.co/papers/2305.08891).
 
     Args:
         noise_cfg (`torch.Tensor`):
diff --git a/src/diffusers/guiders/perturbed_attention_guidance.py b/src/diffusers/guiders/perturbed_attention_guidance.py
index 29341736e8d9..f233e90ca410 100644
--- a/src/diffusers/guiders/perturbed_attention_guidance.py
+++ b/src/diffusers/guiders/perturbed_attention_guidance.py
@@ -187,6 +187,26 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        if self.num_conditions == 1:
+            tuple_indices = [0]
+            input_predictions = ["pred_cond"]
+        elif self.num_conditions == 2:
+            tuple_indices = [0, 1]
+            input_predictions = (
+                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_skip"]
+            )
+        else:
+            tuple_indices = [0, 1, 0]
+            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.forward
     def forward(
         self,
diff --git a/src/diffusers/guiders/skip_layer_guidance.py b/src/diffusers/guiders/skip_layer_guidance.py
index fa5b93b68009..e6109300d99c 100644
--- a/src/diffusers/guiders/skip_layer_guidance.py
+++ b/src/diffusers/guiders/skip_layer_guidance.py
@@ -183,6 +183,26 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        if self.num_conditions == 1:
+            tuple_indices = [0]
+            input_predictions = ["pred_cond"]
+        elif self.num_conditions == 2:
+            tuple_indices = [0, 1]
+            input_predictions = (
+                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_skip"]
+            )
+        else:
+            tuple_indices = [0, 1, 0]
+            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(
         self,
         pred_cond: torch.Tensor,
diff --git a/src/diffusers/guiders/smoothed_energy_guidance.py b/src/diffusers/guiders/smoothed_energy_guidance.py
index 7446b33f1250..6c3906e820e0 100644
--- a/src/diffusers/guiders/smoothed_energy_guidance.py
+++ b/src/diffusers/guiders/smoothed_energy_guidance.py
@@ -172,6 +172,26 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        if self.num_conditions == 1:
+            tuple_indices = [0]
+            input_predictions = ["pred_cond"]
+        elif self.num_conditions == 2:
+            tuple_indices = [0, 1]
+            input_predictions = (
+                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_seg"]
+            )
+        else:
+            tuple_indices = [0, 1, 0]
+            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_seg"]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(
         self,
         pred_cond: torch.Tensor,
diff --git a/src/diffusers/guiders/tangential_classifier_free_guidance.py b/src/diffusers/guiders/tangential_classifier_free_guidance.py
index cfa3c4a61619..76899c6e8494 100644
--- a/src/diffusers/guiders/tangential_classifier_free_guidance.py
+++ b/src/diffusers/guiders/tangential_classifier_free_guidance.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -74,6 +74,16 @@ def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ->
             data_batches.append(data_batch)
         return data_batches
 
+    def prepare_inputs_from_block_state(
+        self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]]
+    ) -> List["BlockState"]:
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions):
+            data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction)
+            data_batches.append(data_batch)
+        return data_batches
+
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
         pred = None
 
diff --git a/src/diffusers/hooks/__init__.py b/src/diffusers/hooks/__init__.py
index 524a92ea9966..eb12b8a52a1e 100644
--- a/src/diffusers/hooks/__init__.py
+++ b/src/diffusers/hooks/__init__.py
@@ -25,3 +25,4 @@
     from .layerwise_casting import apply_layerwise_casting, apply_layerwise_casting_hook
     from .pyramid_attention_broadcast import PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast
     from .smoothed_energy_guidance_utils import SmoothedEnergyGuidanceConfig
+    from .taylorseer_cache import TaylorSeerCacheConfig, apply_taylorseer_cache
diff --git a/src/diffusers/hooks/_helpers.py b/src/diffusers/hooks/_helpers.py
index 790199f3c978..da7313cb4737 100644
--- a/src/diffusers/hooks/_helpers.py
+++ b/src/diffusers/hooks/_helpers.py
@@ -111,6 +111,7 @@ def _register_attention_processors_metadata():
     from ..models.transformers.transformer_hunyuanimage import HunyuanImageAttnProcessor
     from ..models.transformers.transformer_qwenimage import QwenDoubleStreamAttnProcessor2_0
     from ..models.transformers.transformer_wan import WanAttnProcessor2_0
+    from ..models.transformers.transformer_z_image import ZSingleStreamAttnProcessor
 
     # AttnProcessor2_0
     AttentionProcessorRegistry.register(
@@ -158,6 +159,14 @@ def _register_attention_processors_metadata():
         ),
     )
 
+    # ZSingleStreamAttnProcessor
+    AttentionProcessorRegistry.register(
+        model_class=ZSingleStreamAttnProcessor,
+        metadata=AttentionProcessorMetadata(
+            skip_processor_output_fn=_skip_proc_output_fn_Attention_ZSingleStreamAttnProcessor,
+        ),
+    )
+
 
 def _register_transformer_blocks_metadata():
     from ..models.attention import BasicTransformerBlock
@@ -179,6 +188,7 @@ def _register_transformer_blocks_metadata():
     from ..models.transformers.transformer_mochi import MochiTransformerBlock
     from ..models.transformers.transformer_qwenimage import QwenImageTransformerBlock
     from ..models.transformers.transformer_wan import WanTransformerBlock
+    from ..models.transformers.transformer_z_image import ZImageTransformerBlock
 
     # BasicTransformerBlock
     TransformerBlockRegistry.register(
@@ -312,6 +322,15 @@ def _register_transformer_blocks_metadata():
         ),
     )
 
+    # ZImage
+    TransformerBlockRegistry.register(
+        model_class=ZImageTransformerBlock,
+        metadata=TransformerBlockMetadata(
+            return_hidden_states_index=0,
+            return_encoder_hidden_states_index=None,
+        ),
+    )
+
 
 # fmt: off
 def _skip_attention___ret___hidden_states(self, *args, **kwargs):
@@ -338,4 +357,5 @@ def _skip_attention___ret___hidden_states___encoder_hidden_states(self, *args, *
 _skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states
 _skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states
 _skip_proc_output_fn_Attention_HunyuanImageAttnProcessor = _skip_attention___ret___hidden_states
+_skip_proc_output_fn_Attention_ZSingleStreamAttnProcessor = _skip_attention___ret___hidden_states
 # fmt: on
diff --git a/src/diffusers/hooks/context_parallel.py b/src/diffusers/hooks/context_parallel.py
index 915fe453b90b..6491d17b4f46 100644
--- a/src/diffusers/hooks/context_parallel.py
+++ b/src/diffusers/hooks/context_parallel.py
@@ -203,10 +203,12 @@ def post_forward(self, module, output):
 
     def _prepare_cp_input(self, x: torch.Tensor, cp_input: ContextParallelInput) -> torch.Tensor:
         if cp_input.expected_dims is not None and x.dim() != cp_input.expected_dims:
-            raise ValueError(
-                f"Expected input tensor to have {cp_input.expected_dims} dimensions, but got {x.dim()} dimensions."
+            logger.warning_once(
+                f"Expected input tensor to have {cp_input.expected_dims} dimensions, but got {x.dim()} dimensions, split will not be applied."
             )
-        return EquipartitionSharder.shard(x, cp_input.split_dim, self.parallel_config._flattened_mesh)
+            return x
+        else:
+            return EquipartitionSharder.shard(x, cp_input.split_dim, self.parallel_config._flattened_mesh)
 
 
 class ContextParallelGatherHook(ModelHook):
diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py
index 38f291f5203c..47f1f4199615 100644
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -15,7 +15,7 @@
 import hashlib
 import os
 from contextlib import contextmanager, nullcontext
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
@@ -59,6 +59,9 @@ class GroupOffloadingConfig:
     num_blocks_per_group: Optional[int] = None
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
+    block_modules: Optional[List[str]] = None
+    exclude_kwargs: Optional[List[str]] = None
+    module_prefix: Optional[str] = ""
 
 
 class ModuleGroup:
@@ -77,7 +80,7 @@ def __init__(
         low_cpu_mem_usage: bool = False,
         onload_self: bool = True,
         offload_to_disk_path: Optional[str] = None,
-        group_id: Optional[int] = None,
+        group_id: Optional[Union[int, str]] = None,
     ) -> None:
         self.modules = modules
         self.offload_device = offload_device
@@ -153,27 +156,27 @@ def _pinned_memory_tensors(self):
         finally:
             pinned_dict = None
 
-    def _transfer_tensor_to_device(self, tensor, source_tensor):
+    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream):
         tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
         if self.record_stream:
-            tensor.data.record_stream(self._torch_accelerator_module.current_stream())
+            tensor.data.record_stream(default_stream)
 
-    def _process_tensors_from_modules(self, pinned_memory=None):
+    def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None):
         for group_module in self.modules:
             for param in group_module.parameters():
                 source = pinned_memory[param] if pinned_memory else param.data
-                self._transfer_tensor_to_device(param, source)
+                self._transfer_tensor_to_device(param, source, default_stream)
             for buffer in group_module.buffers():
                 source = pinned_memory[buffer] if pinned_memory else buffer.data
-                self._transfer_tensor_to_device(buffer, source)
+                self._transfer_tensor_to_device(buffer, source, default_stream)
 
         for param in self.parameters:
             source = pinned_memory[param] if pinned_memory else param.data
-            self._transfer_tensor_to_device(param, source)
+            self._transfer_tensor_to_device(param, source, default_stream)
 
         for buffer in self.buffers:
             source = pinned_memory[buffer] if pinned_memory else buffer.data
-            self._transfer_tensor_to_device(buffer, source)
+            self._transfer_tensor_to_device(buffer, source, default_stream)
 
     def _onload_from_disk(self):
         if self.stream is not None:
@@ -208,10 +211,12 @@ def _onload_from_memory(self):
             self.stream.synchronize()
 
         context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
+        default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None
+
         with context:
             if self.stream is not None:
                 with self._pinned_memory_tensors() as pinned_memory:
-                    self._process_tensors_from_modules(pinned_memory)
+                    self._process_tensors_from_modules(pinned_memory, default_stream=default_stream)
             else:
                 self._process_tensors_from_modules(None)
 
@@ -320,7 +325,21 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.group.stream.synchronize()
 
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+
+        # Some Autoencoder models use a feature cache that is passed through submodules
+        # and modified in place. The `send_to_device` call returns a copy of this feature cache object
+        # which breaks the inplace updates. Use `exclude_kwargs` to mark these cache features
+        exclude_kwargs = self.config.exclude_kwargs or []
+        if exclude_kwargs:
+            moved_kwargs = send_to_device(
+                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
+                self.group.onload_device,
+                non_blocking=self.group.non_blocking,
+            )
+            kwargs.update(moved_kwargs)
+        else:
+            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
@@ -453,6 +472,8 @@ def apply_group_offloading(
     record_stream: bool = False,
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
+    block_modules: Optional[List[str]] = None,
+    exclude_kwargs: Optional[List[str]] = None,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -510,6 +531,13 @@ def apply_group_offloading(
             If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
+        block_modules (`List[str]`, *optional*):
+            List of module names that should be treated as blocks for offloading. If provided, only these modules will
+            be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        exclude_kwargs (`List[str]`, *optional*):
+            List of kwarg keys that should not be processed by send_to_device. This is useful for mutable state like
+            caching lists that need to maintain their object identity across forward passes. If not provided, will be
+            inferred from the module's `_skip_keys` attribute if it exists.
 
     Example:
         ```python
@@ -551,6 +579,12 @@ def apply_group_offloading(
 
     _raise_error_if_accelerate_model_or_sequential_hook_present(module)
 
+    if block_modules is None:
+        block_modules = getattr(module, "_group_offload_block_modules", None)
+
+    if exclude_kwargs is None:
+        exclude_kwargs = getattr(module, "_skip_keys", None)
+
     config = GroupOffloadingConfig(
         onload_device=onload_device,
         offload_device=offload_device,
@@ -561,6 +595,8 @@ def apply_group_offloading(
         record_stream=record_stream,
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
+        block_modules=block_modules,
+        exclude_kwargs=exclude_kwargs,
     )
     _apply_group_offloading(module, config)
 
@@ -576,46 +612,66 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
-    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks. In comparison to
-    the "leaf_level" offloading, which is more fine-grained, this offloading is done at the top-level blocks.
-    """
+    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is
+    done at the top-level blocks and modules specified in block_modules.
 
+    When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
+    module, recursively apply block offloading to it.
+    """
     if config.stream is not None and config.num_blocks_per_group != 1:
         logger.warning(
             f"Using streams is only supported for num_blocks_per_group=1. Got {config.num_blocks_per_group=}. Setting it to 1."
         )
         config.num_blocks_per_group = 1
 
-    # Create module groups for ModuleList and Sequential blocks
+    block_modules = set(config.block_modules) if config.block_modules is not None else set()
+
+    # Create module groups for ModuleList and Sequential blocks, and explicitly defined block modules
     modules_with_group_offloading = set()
     unmatched_modules = []
     matched_module_groups = []
+
     for name, submodule in module.named_children():
-        if not isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
-            unmatched_modules.append((name, submodule))
+        # Check if this is an explicitly defined block module
+        if name in block_modules:
+            # Track submodule using a prefix to avoid filename collisions during disk offload.
+            # Without this, submodules sharing the same model class would be assigned identical
+            # filenames (derived from the class name).
+            prefix = f"{config.module_prefix}{name}." if config.module_prefix else f"{name}."
+            submodule_config = replace(config, module_prefix=prefix)
+
+            _apply_group_offloading_block_level(submodule, submodule_config)
             modules_with_group_offloading.add(name)
-            continue
 
-        for i in range(0, len(submodule), config.num_blocks_per_group):
-            current_modules = submodule[i : i + config.num_blocks_per_group]
-            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
-            group = ModuleGroup(
-                modules=current_modules,
-                offload_device=config.offload_device,
-                onload_device=config.onload_device,
-                offload_to_disk_path=config.offload_to_disk_path,
-                offload_leader=current_modules[-1],
-                onload_leader=current_modules[0],
-                non_blocking=config.non_blocking,
-                stream=config.stream,
-                record_stream=config.record_stream,
-                low_cpu_mem_usage=config.low_cpu_mem_usage,
-                onload_self=True,
-                group_id=group_id,
-            )
-            matched_module_groups.append(group)
-            for j in range(i, i + len(current_modules)):
-                modules_with_group_offloading.add(f"{name}.{j}")
+        elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+            # Handle ModuleList and Sequential blocks as before
+            for i in range(0, len(submodule), config.num_blocks_per_group):
+                current_modules = list(submodule[i : i + config.num_blocks_per_group])
+                if len(current_modules) == 0:
+                    continue
+
+                group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
+                group = ModuleGroup(
+                    modules=current_modules,
+                    offload_device=config.offload_device,
+                    onload_device=config.onload_device,
+                    offload_to_disk_path=config.offload_to_disk_path,
+                    offload_leader=current_modules[-1],
+                    onload_leader=current_modules[0],
+                    non_blocking=config.non_blocking,
+                    stream=config.stream,
+                    record_stream=config.record_stream,
+                    low_cpu_mem_usage=config.low_cpu_mem_usage,
+                    onload_self=True,
+                    group_id=group_id,
+                )
+                matched_module_groups.append(group)
+                for j in range(i, i + len(current_modules)):
+                    modules_with_group_offloading.add(f"{name}.{j}")
+        else:
+            # This is an unmatched module
+            unmatched_modules.append((name, submodule))
 
     # Apply group offloading hooks to the module groups
     for i, group in enumerate(matched_module_groups):
@@ -630,28 +686,29 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
     parameters = [param for _, param in parameters]
     buffers = [buffer for _, buffer in buffers]
 
-    # Create a group for the unmatched submodules of the top-level module so that they are on the correct
-    # device when the forward pass is called.
+    # Create a group for the remaining unmatched submodules of the top-level
+    # module so that they are on the correct device when the forward pass is called.
     unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules]
-    unmatched_group = ModuleGroup(
-        modules=unmatched_modules,
-        offload_device=config.offload_device,
-        onload_device=config.onload_device,
-        offload_to_disk_path=config.offload_to_disk_path,
-        offload_leader=module,
-        onload_leader=module,
-        parameters=parameters,
-        buffers=buffers,
-        non_blocking=False,
-        stream=None,
-        record_stream=False,
-        onload_self=True,
-        group_id=f"{module.__class__.__name__}_unmatched_group",
-    )
-    if config.stream is None:
-        _apply_group_offloading_hook(module, unmatched_group, config=config)
-    else:
-        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+    if len(unmatched_modules) > 0 or len(parameters) > 0 or len(buffers) > 0:
+        unmatched_group = ModuleGroup(
+            modules=unmatched_modules,
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
+            offload_leader=module,
+            onload_leader=module,
+            parameters=parameters,
+            buffers=buffers,
+            non_blocking=False,
+            stream=None,
+            record_stream=False,
+            onload_self=True,
+            group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group",
+        )
+        if config.stream is None:
+            _apply_group_offloading_hook(module, unmatched_group, config=config)
+        else:
+            _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
 
 
 def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
diff --git a/src/diffusers/hooks/taylorseer_cache.py b/src/diffusers/hooks/taylorseer_cache.py
new file mode 100644
index 000000000000..7cad9f4fa161
--- /dev/null
+++ b/src/diffusers/hooks/taylorseer_cache.py
@@ -0,0 +1,346 @@
+import math
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from ..utils import logging
+from .hooks import HookRegistry, ModelHook, StateManager
+
+
+logger = logging.get_logger(__name__)
+_TAYLORSEER_CACHE_HOOK = "taylorseer_cache"
+_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS = (
+    "^blocks.*attn",
+    "^transformer_blocks.*attn",
+    "^single_transformer_blocks.*attn",
+)
+_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS = ("^temporal_transformer_blocks.*attn",)
+_TRANSFORMER_BLOCK_IDENTIFIERS = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS + _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS
+_BLOCK_IDENTIFIERS = ("^[^.]*block[^.]*\\.[^.]+$",)
+_PROJ_OUT_IDENTIFIERS = ("^proj_out$",)
+
+
+@dataclass
+class TaylorSeerCacheConfig:
+    """
+    Configuration for TaylorSeer cache. See: https://huggingface.co/papers/2503.06923
+
+    Attributes:
+        cache_interval (`int`, defaults to `5`):
+            The interval between full computation steps. After a full computation, the cached (predicted) outputs are
+            reused for this many subsequent denoising steps before refreshing with a new full forward pass.
+
+        disable_cache_before_step (`int`, defaults to `3`):
+            The denoising step index before which caching is disabled, meaning full computation is performed for the
+            initial steps (0 to disable_cache_before_step - 1) to gather data for Taylor series approximations. During
+            these steps, Taylor factors are updated, but caching/predictions are not applied. Caching begins at this
+            step.
+
+        disable_cache_after_step (`int`, *optional*, defaults to `None`):
+            The denoising step index after which caching is disabled. If set, for steps >= this value, all modules run
+            full computations without predictions or state updates, ensuring accuracy in later stages if needed.
+
+        max_order (`int`, defaults to `1`):
+            The highest order in the Taylor series expansion for approximating module outputs. Higher orders provide
+            better approximations but increase computation and memory usage.
+
+        taylor_factors_dtype (`torch.dtype`, defaults to `torch.bfloat16`):
+            Data type used for storing and computing Taylor series factors. Lower precision reduces memory but may
+            affect stability; higher precision improves accuracy at the cost of more memory.
+
+        skip_predict_identifiers (`List[str]`, *optional*, defaults to `None`):
+            Regex patterns (using `re.fullmatch`) for module names to place as "skip" in "cache" mode. In this mode,
+            the module computes fully during initial or refresh steps but returns a zero tensor (matching recorded
+            shape) during prediction steps to skip computation cheaply.
+
+        cache_identifiers (`List[str]`, *optional*, defaults to `None`):
+            Regex patterns (using `re.fullmatch`) for module names to place in Taylor-series caching mode, where
+            outputs are approximated and cached for reuse.
+
+        use_lite_mode (`bool`, *optional*, defaults to `False`):
+            Enables a lightweight TaylorSeer variant that minimizes memory usage by applying predefined patterns for
+            skipping and caching (e.g., skipping blocks and caching projections). This overrides any custom
+            `inactive_identifiers` or `active_identifiers`.
+
+    Notes:
+        - Patterns are matched using `re.fullmatch` on the module name.
+        - If `skip_predict_identifiers` or `cache_identifiers` are provided, only matching modules are hooked.
+        - If neither is provided, all attention-like modules are hooked by default.
+
+    Example of inactive and active usage:
+
+    ```py
+    def forward(x):
+        x = self.module1(x)  # inactive module: returns zeros tensor based on shape recorded during full compute
+        x = self.module2(x)  # active module: caches output here, avoiding recomputation of prior steps
+        return x
+    ```
+    """
+
+    cache_interval: int = 5
+    disable_cache_before_step: int = 3
+    disable_cache_after_step: Optional[int] = None
+    max_order: int = 1
+    taylor_factors_dtype: Optional[torch.dtype] = torch.bfloat16
+    skip_predict_identifiers: Optional[List[str]] = None
+    cache_identifiers: Optional[List[str]] = None
+    use_lite_mode: bool = False
+
+    def __repr__(self) -> str:
+        return (
+            "TaylorSeerCacheConfig("
+            f"cache_interval={self.cache_interval}, "
+            f"disable_cache_before_step={self.disable_cache_before_step}, "
+            f"disable_cache_after_step={self.disable_cache_after_step}, "
+            f"max_order={self.max_order}, "
+            f"taylor_factors_dtype={self.taylor_factors_dtype}, "
+            f"skip_predict_identifiers={self.skip_predict_identifiers}, "
+            f"cache_identifiers={self.cache_identifiers}, "
+            f"use_lite_mode={self.use_lite_mode})"
+        )
+
+
+class TaylorSeerState:
+    def __init__(
+        self,
+        taylor_factors_dtype: Optional[torch.dtype] = torch.bfloat16,
+        max_order: int = 1,
+        is_inactive: bool = False,
+    ):
+        self.taylor_factors_dtype = taylor_factors_dtype
+        self.max_order = max_order
+        self.is_inactive = is_inactive
+
+        self.module_dtypes: Tuple[torch.dtype, ...] = ()
+        self.last_update_step: Optional[int] = None
+        self.taylor_factors: Dict[int, Dict[int, torch.Tensor]] = {}
+        self.inactive_shapes: Optional[Tuple[Tuple[int, ...], ...]] = None
+        self.device: Optional[torch.device] = None
+        self.current_step: int = -1
+
+    def reset(self) -> None:
+        self.current_step = -1
+        self.last_update_step = None
+        self.taylor_factors = {}
+        self.inactive_shapes = None
+        self.device = None
+
+    def update(
+        self,
+        outputs: Tuple[torch.Tensor, ...],
+    ) -> None:
+        self.module_dtypes = tuple(output.dtype for output in outputs)
+        self.device = outputs[0].device
+
+        if self.is_inactive:
+            self.inactive_shapes = tuple(output.shape for output in outputs)
+        else:
+            for i, features in enumerate(outputs):
+                new_factors: Dict[int, torch.Tensor] = {0: features}
+                is_first_update = self.last_update_step is None
+                if not is_first_update:
+                    delta_step = self.current_step - self.last_update_step
+                    if delta_step == 0:
+                        raise ValueError("Delta step cannot be zero for TaylorSeer update.")
+
+                    # Recursive divided differences up to max_order
+                    prev_factors = self.taylor_factors.get(i, {})
+                    for j in range(self.max_order):
+                        prev = prev_factors.get(j)
+                        if prev is None:
+                            break
+                        new_factors[j + 1] = (new_factors[j] - prev.to(features.dtype)) / delta_step
+                self.taylor_factors[i] = {
+                    order: factor.to(self.taylor_factors_dtype) for order, factor in new_factors.items()
+                }
+
+        self.last_update_step = self.current_step
+
+    @torch.compiler.disable
+    def predict(self) -> List[torch.Tensor]:
+        if self.last_update_step is None:
+            raise ValueError("Cannot predict without prior initialization/update.")
+
+        step_offset = self.current_step - self.last_update_step
+
+        outputs = []
+        if self.is_inactive:
+            if self.inactive_shapes is None:
+                raise ValueError("Inactive shapes not set during prediction.")
+            for i in range(len(self.module_dtypes)):
+                outputs.append(
+                    torch.zeros(
+                        self.inactive_shapes[i],
+                        dtype=self.module_dtypes[i],
+                        device=self.device,
+                    )
+                )
+        else:
+            if not self.taylor_factors:
+                raise ValueError("Taylor factors empty during prediction.")
+            num_outputs = len(self.taylor_factors)
+            num_orders = len(self.taylor_factors[0])
+            for i in range(num_outputs):
+                output_dtype = self.module_dtypes[i]
+                taylor_factors = self.taylor_factors[i]
+                output = torch.zeros_like(taylor_factors[0], dtype=output_dtype)
+                for order in range(num_orders):
+                    coeff = (step_offset**order) / math.factorial(order)
+                    factor = taylor_factors[order]
+                    output = output + factor.to(output_dtype) * coeff
+                outputs.append(output)
+        return outputs
+
+
+class TaylorSeerCacheHook(ModelHook):
+    _is_stateful = True
+
+    def __init__(
+        self,
+        cache_interval: int,
+        disable_cache_before_step: int,
+        taylor_factors_dtype: torch.dtype,
+        state_manager: StateManager,
+        disable_cache_after_step: Optional[int] = None,
+    ):
+        super().__init__()
+        self.cache_interval = cache_interval
+        self.disable_cache_before_step = disable_cache_before_step
+        self.disable_cache_after_step = disable_cache_after_step
+        self.taylor_factors_dtype = taylor_factors_dtype
+        self.state_manager = state_manager
+
+    def initialize_hook(self, module: torch.nn.Module):
+        return module
+
+    def reset_state(self, module: torch.nn.Module) -> None:
+        """
+        Reset state between sampling runs.
+        """
+        self.state_manager.reset()
+
+    @torch.compiler.disable
+    def _measure_should_compute(self) -> bool:
+        state: TaylorSeerState = self.state_manager.get_state()
+        state.current_step += 1
+        current_step = state.current_step
+        is_warmup_phase = current_step < self.disable_cache_before_step
+        is_compute_interval = (current_step - self.disable_cache_before_step - 1) % self.cache_interval == 0
+        is_cooldown_phase = self.disable_cache_after_step is not None and current_step >= self.disable_cache_after_step
+        should_compute = is_warmup_phase or is_compute_interval or is_cooldown_phase
+        return should_compute, state
+
+    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
+        should_compute, state = self._measure_should_compute()
+        if should_compute:
+            outputs = self.fn_ref.original_forward(*args, **kwargs)
+            wrapped_outputs = (outputs,) if isinstance(outputs, torch.Tensor) else outputs
+            state.update(wrapped_outputs)
+            return outputs
+
+        outputs_list = state.predict()
+        return outputs_list[0] if len(outputs_list) == 1 else tuple(outputs_list)
+
+
+def _resolve_patterns(config: TaylorSeerCacheConfig) -> Tuple[List[str], List[str]]:
+    """
+    Resolve effective inactive and active pattern lists from config + templates.
+    """
+
+    inactive_patterns = config.skip_predict_identifiers if config.skip_predict_identifiers is not None else None
+    active_patterns = config.cache_identifiers if config.cache_identifiers is not None else None
+
+    return inactive_patterns or [], active_patterns or []
+
+
+def apply_taylorseer_cache(module: torch.nn.Module, config: TaylorSeerCacheConfig):
+    """
+    Applies the TaylorSeer cache to a given pipeline (typically the transformer / UNet).
+
+    This function hooks selected modules in the model to enable caching or skipping based on the provided
+    configuration, reducing redundant computations in diffusion denoising loops.
+
+    Args:
+        module (torch.nn.Module): The model subtree to apply the hooks to.
+        config (TaylorSeerCacheConfig): Configuration for the cache.
+
+    Example:
+    ```python
+    >>> import torch
+    >>> from diffusers import FluxPipeline, TaylorSeerCacheConfig
+
+    >>> pipe = FluxPipeline.from_pretrained(
+    ...     "black-forest-labs/FLUX.1-dev",
+    ...     torch_dtype=torch.bfloat16,
+    ... )
+    >>> pipe.to("cuda")
+
+    >>> config = TaylorSeerCacheConfig(
+    ...     cache_interval=5,
+    ...     max_order=1,
+    ...     disable_cache_before_step=3,
+    ...     taylor_factors_dtype=torch.float32,
+    ... )
+    >>> pipe.transformer.enable_cache(config)
+    ```
+    """
+    inactive_patterns, active_patterns = _resolve_patterns(config)
+
+    active_patterns = active_patterns or _TRANSFORMER_BLOCK_IDENTIFIERS
+
+    if config.use_lite_mode:
+        logger.info("Using TaylorSeer Lite variant for cache.")
+        active_patterns = _PROJ_OUT_IDENTIFIERS
+        inactive_patterns = _BLOCK_IDENTIFIERS
+        if config.skip_predict_identifiers or config.cache_identifiers:
+            logger.warning("Lite mode overrides user patterns.")
+
+    for name, submodule in module.named_modules():
+        matches_inactive = any(re.fullmatch(pattern, name) for pattern in inactive_patterns)
+        matches_active = any(re.fullmatch(pattern, name) for pattern in active_patterns)
+        if not (matches_inactive or matches_active):
+            continue
+        _apply_taylorseer_cache_hook(
+            module=submodule,
+            config=config,
+            is_inactive=matches_inactive,
+        )
+
+
+def _apply_taylorseer_cache_hook(
+    module: nn.Module,
+    config: TaylorSeerCacheConfig,
+    is_inactive: bool,
+):
+    """
+    Registers the TaylorSeer hook on the specified nn.Module.
+
+    Args:
+        name: Name of the module.
+        module: The nn.Module to be hooked.
+        config: Cache configuration.
+        is_inactive: Whether this module should operate in "inactive" mode.
+    """
+    state_manager = StateManager(
+        TaylorSeerState,
+        init_kwargs={
+            "taylor_factors_dtype": config.taylor_factors_dtype,
+            "max_order": config.max_order,
+            "is_inactive": is_inactive,
+        },
+    )
+
+    registry = HookRegistry.check_if_exists_or_initialize(module)
+
+    hook = TaylorSeerCacheHook(
+        cache_interval=config.cache_interval,
+        disable_cache_before_step=config.disable_cache_before_step,
+        taylor_factors_dtype=config.taylor_factors_dtype,
+        disable_cache_after_step=config.disable_cache_after_step,
+        state_manager=state_manager,
+    )
+
+    registry.register_hook(hook, _TAYLORSEER_CACHE_HOOK)
diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 0e3082eada8a..abd0a25819f5 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -409,7 +409,7 @@ def _resize_and_fill(
         src_w = width if ratio < src_ratio else image.width * height // image.height
         src_h = height if ratio >= src_ratio else image.height * width // image.width
 
-        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
+        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
         res = Image.new("RGB", (width, height))
         res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
 
@@ -460,7 +460,7 @@ def _resize_and_crop(
         src_w = width if ratio > src_ratio else image.width * height // image.height
         src_h = height if ratio <= src_ratio else image.height * width // image.width
 
-        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
+        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
         res = Image.new("RGB", (width, height))
         res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
         return res
@@ -1045,16 +1045,39 @@ def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) ->
     def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         r"""
         Convert an RGB-like depth image to a depth map.
+        """
+        # 1. Cast the tensor to a larger integer type (e.g., int32)
+        #    to safely perform the multiplication by 256.
+        # 2. Perform the 16-bit combination: High-byte * 256 + Low-byte.
+        # 3. Cast the final result to the desired depth map type (uint16) if needed
+        #    before returning, though leaving it as int32/int64 is often safer
+        #    for return value from a library function.
+
+        if isinstance(image, torch.Tensor):
+            # Cast to a safe dtype (e.g., int32 or int64) for the calculation
+            original_dtype = image.dtype
+            image_safe = image.to(torch.int32)
+
+            # Calculate the depth map
+            depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
+
+            # You may want to cast the final result to uint16, but casting to a
+            # larger int type (like int32) is sufficient to fix the overflow.
+            # depth_map = depth_map.to(torch.uint16) # Uncomment if uint16 is strictly required
+            return depth_map.to(original_dtype)
 
-        Args:
-            image (`Union[np.ndarray, torch.Tensor]`):
-                The RGB-like depth image to convert.
+        elif isinstance(image, np.ndarray):
+            # NumPy equivalent: Cast to a safe dtype (e.g., np.int32)
+            original_dtype = image.dtype
+            image_safe = image.astype(np.int32)
 
-        Returns:
-            `Union[np.ndarray, torch.Tensor]`:
-                The corresponding depth map.
-        """
-        return image[:, :, 1] * 2**8 + image[:, :, 2]
+            # Calculate the depth map
+            depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
+
+            # depth_map = depth_map.astype(np.uint16) # Uncomment if uint16 is strictly required
+            return depth_map.astype(original_dtype)
+        else:
+            raise TypeError("Input image must be a torch.Tensor or np.ndarray")
 
     def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
         r"""
diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py
index 48507aae038c..ace4e8543a1c 100644
--- a/src/diffusers/loaders/__init__.py
+++ b/src/diffusers/loaders/__init__.py
@@ -81,6 +81,8 @@ def text_encoder_attn_modules(text_encoder):
             "HiDreamImageLoraLoaderMixin",
             "SkyReelsV2LoraLoaderMixin",
             "QwenImageLoraLoaderMixin",
+            "ZImageLoraLoaderMixin",
+            "Flux2LoraLoaderMixin",
         ]
         _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
         _import_structure["ip_adapter"] = [
@@ -113,6 +115,7 @@ def text_encoder_attn_modules(text_encoder):
                 AuraFlowLoraLoaderMixin,
                 CogVideoXLoraLoaderMixin,
                 CogView4LoraLoaderMixin,
+                Flux2LoraLoaderMixin,
                 FluxLoraLoaderMixin,
                 HiDreamImageLoraLoaderMixin,
                 HunyuanVideoLoraLoaderMixin,
@@ -128,6 +131,7 @@ def text_encoder_attn_modules(text_encoder):
                 StableDiffusionLoraLoaderMixin,
                 StableDiffusionXLLoraLoaderMixin,
                 WanLoraLoaderMixin,
+                ZImageLoraLoaderMixin,
             )
             from .single_file import FromSingleFileMixin
             from .textual_inversion import TextualInversionLoaderMixin
diff --git a/src/diffusers/loaders/lora_conversion_utils.py b/src/diffusers/loaders/lora_conversion_utils.py
index 099dbfc1d2d8..2e87f757c352 100644
--- a/src/diffusers/loaders/lora_conversion_utils.py
+++ b/src/diffusers/loaders/lora_conversion_utils.py
@@ -2213,6 +2213,225 @@ def convert_key(key: str) -> str:
 
         state_dict = {convert_key(k): v for k, v in state_dict.items()}
 
+    has_default = any("default." in k for k in state_dict)
+    if has_default:
+        state_dict = {k.replace("default.", ""): v for k, v in state_dict.items()}
+
+    converted_state_dict = {}
+    all_keys = list(state_dict.keys())
+    down_key = ".lora_down.weight"
+    up_key = ".lora_up.weight"
+    a_key = ".lora_A.weight"
+    b_key = ".lora_B.weight"
+
+    has_non_diffusers_lora_id = any(down_key in k or up_key in k for k in all_keys)
+    has_diffusers_lora_id = any(a_key in k or b_key in k for k in all_keys)
+
+    if has_non_diffusers_lora_id:
+
+        def get_alpha_scales(down_weight, alpha_key):
+            rank = down_weight.shape[0]
+            alpha = state_dict.pop(alpha_key).item()
+            scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+            scale_down = scale
+            scale_up = 1.0
+            while scale_down * 2 < scale_up:
+                scale_down *= 2
+                scale_up /= 2
+            return scale_down, scale_up
+
+        for k in all_keys:
+            if k.endswith(down_key):
+                diffusers_down_key = k.replace(down_key, ".lora_A.weight")
+                diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
+                alpha_key = k.replace(down_key, ".alpha")
+
+                down_weight = state_dict.pop(k)
+                up_weight = state_dict.pop(k.replace(down_key, up_key))
+                scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+                converted_state_dict[diffusers_down_key] = down_weight * scale_down
+                converted_state_dict[diffusers_up_key] = up_weight * scale_up
+
+    # Already in diffusers format (lora_A/lora_B), just pop
+    elif has_diffusers_lora_id:
+        for k in all_keys:
+            if a_key in k or b_key in k:
+                converted_state_dict[k] = state_dict.pop(k)
+            elif ".alpha" in k:
+                state_dict.pop(k)
+
+    if len(state_dict) > 0:
+        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
+
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict
+
+
+def _convert_non_diffusers_flux2_lora_to_diffusers(state_dict):
+    converted_state_dict = {}
+
+    prefix = "diffusion_model."
+    original_state_dict = {k[len(prefix) :]: v for k, v in state_dict.items()}
+
+    num_double_layers = 8
+    num_single_layers = 48
+    lora_keys = ("lora_A", "lora_B")
+    attn_types = ("img_attn", "txt_attn")
+
+    for sl in range(num_single_layers):
+        single_block_prefix = f"single_blocks.{sl}"
+        attn_prefix = f"single_transformer_blocks.{sl}.attn"
+
+        for lora_key in lora_keys:
+            converted_state_dict[f"{attn_prefix}.to_qkv_mlp_proj.{lora_key}.weight"] = original_state_dict.pop(
+                f"{single_block_prefix}.linear1.{lora_key}.weight"
+            )
+
+            converted_state_dict[f"{attn_prefix}.to_out.{lora_key}.weight"] = original_state_dict.pop(
+                f"{single_block_prefix}.linear2.{lora_key}.weight"
+            )
+
+    for dl in range(num_double_layers):
+        transformer_block_prefix = f"transformer_blocks.{dl}"
+
+        for lora_key in lora_keys:
+            for attn_type in attn_types:
+                attn_prefix = f"{transformer_block_prefix}.attn"
+                qkv_key = f"double_blocks.{dl}.{attn_type}.qkv.{lora_key}.weight"
+                fused_qkv_weight = original_state_dict.pop(qkv_key)
+
+                if lora_key == "lora_A":
+                    diff_attn_proj_keys = (
+                        ["to_q", "to_k", "to_v"]
+                        if attn_type == "img_attn"
+                        else ["add_q_proj", "add_k_proj", "add_v_proj"]
+                    )
+                    for proj_key in diff_attn_proj_keys:
+                        converted_state_dict[f"{attn_prefix}.{proj_key}.{lora_key}.weight"] = torch.cat(
+                            [fused_qkv_weight]
+                        )
+                else:
+                    sample_q, sample_k, sample_v = torch.chunk(fused_qkv_weight, 3, dim=0)
+
+                    if attn_type == "img_attn":
+                        converted_state_dict[f"{attn_prefix}.to_q.{lora_key}.weight"] = torch.cat([sample_q])
+                        converted_state_dict[f"{attn_prefix}.to_k.{lora_key}.weight"] = torch.cat([sample_k])
+                        converted_state_dict[f"{attn_prefix}.to_v.{lora_key}.weight"] = torch.cat([sample_v])
+                    else:
+                        converted_state_dict[f"{attn_prefix}.add_q_proj.{lora_key}.weight"] = torch.cat([sample_q])
+                        converted_state_dict[f"{attn_prefix}.add_k_proj.{lora_key}.weight"] = torch.cat([sample_k])
+                        converted_state_dict[f"{attn_prefix}.add_v_proj.{lora_key}.weight"] = torch.cat([sample_v])
+
+        proj_mappings = [
+            ("img_attn.proj", "attn.to_out.0"),
+            ("txt_attn.proj", "attn.to_add_out"),
+        ]
+        for org_proj, diff_proj in proj_mappings:
+            for lora_key in lora_keys:
+                original_key = f"double_blocks.{dl}.{org_proj}.{lora_key}.weight"
+                diffusers_key = f"{transformer_block_prefix}.{diff_proj}.{lora_key}.weight"
+                converted_state_dict[diffusers_key] = original_state_dict.pop(original_key)
+
+        mlp_mappings = [
+            ("img_mlp.0", "ff.linear_in"),
+            ("img_mlp.2", "ff.linear_out"),
+            ("txt_mlp.0", "ff_context.linear_in"),
+            ("txt_mlp.2", "ff_context.linear_out"),
+        ]
+        for org_mlp, diff_mlp in mlp_mappings:
+            for lora_key in lora_keys:
+                original_key = f"double_blocks.{dl}.{org_mlp}.{lora_key}.weight"
+                diffusers_key = f"{transformer_block_prefix}.{diff_mlp}.{lora_key}.weight"
+                converted_state_dict[diffusers_key] = original_state_dict.pop(original_key)
+
+    if len(original_state_dict) > 0:
+        raise ValueError(f"`original_state_dict` should be empty at this point but has {original_state_dict.keys()=}.")
+
+    for key in list(converted_state_dict.keys()):
+        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
+
+    return converted_state_dict
+
+
+def _convert_non_diffusers_z_image_lora_to_diffusers(state_dict):
+    """
+    Convert non-diffusers ZImage LoRA state dict to diffusers format.
+
+    Handles:
+    - `diffusion_model.` prefix removal
+    - `lora_unet_` prefix conversion with key mapping
+    - `default.` prefix removal
+    - `.lora_down.weight`/`.lora_up.weight` → `.lora_A.weight`/`.lora_B.weight` conversion with alpha scaling
+    """
+    has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
+    if has_diffusion_model:
+        state_dict = {k.removeprefix("diffusion_model."): v for k, v in state_dict.items()}
+
+    has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
+    if has_lora_unet:
+        state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()}
+
+        def convert_key(key: str) -> str:
+            # ZImage has: layers, noise_refiner, context_refiner blocks
+            # Keys may be like: layers_0_attention_to_q.lora_down.weight
+
+            if "." in key:
+                base, suffix = key.rsplit(".", 1)
+            else:
+                base, suffix = key, ""
+
+            # Protected n-grams that must keep their internal underscores
+            protected = {
+                # pairs for attention
+                ("to", "q"),
+                ("to", "k"),
+                ("to", "v"),
+                ("to", "out"),
+                # feed_forward
+                ("feed", "forward"),
+            }
+
+            prot_by_len = {}
+            for ng in protected:
+                prot_by_len.setdefault(len(ng), set()).add(ng)
+
+            parts = base.split("_")
+            merged = []
+            i = 0
+            lengths_desc = sorted(prot_by_len.keys(), reverse=True)
+
+            while i < len(parts):
+                matched = False
+                for L in lengths_desc:
+                    if i + L <= len(parts) and tuple(parts[i : i + L]) in prot_by_len[L]:
+                        merged.append("_".join(parts[i : i + L]))
+                        i += L
+                        matched = True
+                        break
+                if not matched:
+                    merged.append(parts[i])
+                    i += 1
+
+            converted_base = ".".join(merged)
+            return converted_base + (("." + suffix) if suffix else "")
+
+        state_dict = {convert_key(k): v for k, v in state_dict.items()}
+
+    def normalize_out_key(k: str) -> str:
+        if ".to_out" in k:
+            return k
+        return re.sub(
+            r"\.out(?=\.(?:lora_down|lora_up)\.weight$|\.alpha$)",
+            ".to_out.0",
+            k,
+        )
+
+    state_dict = {normalize_out_key(k): v for k, v in state_dict.items()}
+
+    has_default = any("default." in k for k in state_dict)
+    if has_default:
+        state_dict = {k.replace("default.", ""): v for k, v in state_dict.items()}
+
     converted_state_dict = {}
     all_keys = list(state_dict.keys())
     down_key = ".lora_down.weight"
diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py
index 2bb6c0ea026e..bcbe54649f89 100644
--- a/src/diffusers/loaders/lora_pipeline.py
+++ b/src/diffusers/loaders/lora_pipeline.py
@@ -45,12 +45,14 @@
     _convert_hunyuan_video_lora_to_diffusers,
     _convert_kohya_flux_lora_to_diffusers,
     _convert_musubi_wan_lora_to_diffusers,
+    _convert_non_diffusers_flux2_lora_to_diffusers,
     _convert_non_diffusers_hidream_lora_to_diffusers,
     _convert_non_diffusers_lora_to_diffusers,
     _convert_non_diffusers_ltxv_lora_to_diffusers,
     _convert_non_diffusers_lumina2_lora_to_diffusers,
     _convert_non_diffusers_qwen_lora_to_diffusers,
     _convert_non_diffusers_wan_lora_to_diffusers,
+    _convert_non_diffusers_z_image_lora_to_diffusers,
     _convert_xlabs_flux_lora_to_diffusers,
     _maybe_map_sgm_blocks_to_diffusers,
 )
@@ -4940,7 +4942,8 @@ def lora_state_dict(
         has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
         has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
         has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
-        if has_alphas_in_sd or has_lora_unet or has_diffusion_model:
+        has_default = any("default." in k for k in state_dict)
+        if has_alphas_in_sd or has_lora_unet or has_diffusion_model or has_default:
             state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)
 
         out = (state_dict, metadata) if return_lora_metadata else state_dict
@@ -5083,6 +5086,415 @@ def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs):
         super().unfuse_lora(components=components, **kwargs)
 
 
+class ZImageLoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into [`ZImageTransformer2DModel`]. Specific to [`ZImagePipeline`].
+    """
+
+    _lora_loadable_modules = ["transformer"]
+    transformer_name = TRANSFORMER_NAME
+
+    @classmethod
+    @validate_hf_hub_args
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details.
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # transformer and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        return_lora_metadata = kwargs.pop("return_lora_metadata", False)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
+
+        state_dict, metadata = _fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        is_dora_scale_present = any("dora_scale" in k for k in state_dict)
+        if is_dora_scale_present:
+            warn_msg = "It seems like you are using a DoRA checkpoint that is not compatible in Diffusers at the moment. So, we are going to filter out the keys associated to 'dora_scale` from the state dict. If you think this is a mistake please open an issue https://github.com/huggingface/diffusers/issues/new."
+            logger.warning(warn_msg)
+            state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
+
+        has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
+        has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
+        has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
+        has_default = any("default." in k for k in state_dict)
+        if has_alphas_in_sd or has_lora_unet or has_diffusion_model or has_default:
+            state_dict = _convert_non_diffusers_z_image_lora_to_diffusers(state_dict)
+
+        out = (state_dict, metadata) if return_lora_metadata else state_dict
+        return out
+
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
+    def load_lora_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        adapter_name: Optional[str] = None,
+        hotswap: bool = False,
+        **kwargs,
+    ):
+        """
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT_LORA)
+        if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        kwargs["return_lora_metadata"] = True
+        state_dict, metadata = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+
+        is_correct_format = all("lora" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_transformer(
+            state_dict,
+            transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
+            adapter_name=adapter_name,
+            metadata=metadata,
+            _pipeline=self,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            hotswap=hotswap,
+        )
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->ZImageTransformer2DModel
+    def load_lora_into_transformer(
+        cls,
+        state_dict,
+        transformer,
+        adapter_name=None,
+        _pipeline=None,
+        low_cpu_mem_usage=False,
+        hotswap: bool = False,
+        metadata=None,
+    ):
+        """
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details.
+        """
+        if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+
+        # Load the layers corresponding to transformer.
+        logger.info(f"Loading {cls.transformer_name}.")
+        transformer.load_lora_adapter(
+            state_dict,
+            network_alphas=None,
+            adapter_name=adapter_name,
+            metadata=metadata,
+            _pipeline=_pipeline,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            hotswap=hotswap,
+        )
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.save_lora_weights
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        transformer_lora_adapter_metadata: Optional[dict] = None,
+    ):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information.
+        """
+        lora_layers = {}
+        lora_metadata = {}
+
+        if transformer_lora_layers:
+            lora_layers[cls.transformer_name] = transformer_lora_layers
+            lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata
+
+        if not lora_layers:
+            raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.")
+
+        cls._save_lora_weights(
+            save_directory=save_directory,
+            lora_layers=lora_layers,
+            lora_metadata=lora_metadata,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora
+    def fuse_lora(
+        self,
+        components: List[str] = ["transformer"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details.
+        """
+        super().fuse_lora(
+            components=components,
+            lora_scale=lora_scale,
+            safe_fusing=safe_fusing,
+            adapter_names=adapter_names,
+            **kwargs,
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora
+    def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details.
+        """
+        super().unfuse_lora(components=components, **kwargs)
+
+
+class Flux2LoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into [`Flux2Transformer2DModel`]. Specific to [`Flux2Pipeline`].
+    """
+
+    _lora_loadable_modules = ["transformer"]
+    transformer_name = TRANSFORMER_NAME
+
+    @classmethod
+    @validate_hf_hub_args
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details.
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # transformer and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        return_lora_metadata = kwargs.pop("return_lora_metadata", False)
+
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+
+        user_agent = {"file_type": "attn_procs_weights", "framework": "pytorch"}
+
+        state_dict, metadata = _fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+
+        is_dora_scale_present = any("dora_scale" in k for k in state_dict)
+        if is_dora_scale_present:
+            warn_msg = "It seems like you are using a DoRA checkpoint that is not compatible in Diffusers at the moment. So, we are going to filter out the keys associated to 'dora_scale` from the state dict. If you think this is a mistake please open an issue https://github.com/huggingface/diffusers/issues/new."
+            logger.warning(warn_msg)
+            state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
+
+        is_ai_toolkit = any(k.startswith("diffusion_model.") for k in state_dict)
+        if is_ai_toolkit:
+            state_dict = _convert_non_diffusers_flux2_lora_to_diffusers(state_dict)
+
+        out = (state_dict, metadata) if return_lora_metadata else state_dict
+        return out
+
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
+    def load_lora_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        adapter_name: Optional[str] = None,
+        hotswap: bool = False,
+        **kwargs,
+    ):
+        """
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details.
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT_LORA)
+        if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        kwargs["return_lora_metadata"] = True
+        state_dict, metadata = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+
+        is_correct_format = all("lora" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+
+        self.load_lora_into_transformer(
+            state_dict,
+            transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
+            adapter_name=adapter_name,
+            metadata=metadata,
+            _pipeline=self,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            hotswap=hotswap,
+        )
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->CogView4Transformer2DModel
+    def load_lora_into_transformer(
+        cls,
+        state_dict,
+        transformer,
+        adapter_name=None,
+        _pipeline=None,
+        low_cpu_mem_usage=False,
+        hotswap: bool = False,
+        metadata=None,
+    ):
+        """
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details.
+        """
+        if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+
+        # Load the layers corresponding to transformer.
+        logger.info(f"Loading {cls.transformer_name}.")
+        transformer.load_lora_adapter(
+            state_dict,
+            network_alphas=None,
+            adapter_name=adapter_name,
+            metadata=metadata,
+            _pipeline=_pipeline,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            hotswap=hotswap,
+        )
+
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.save_lora_weights
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        transformer_lora_adapter_metadata: Optional[dict] = None,
+    ):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information.
+        """
+        lora_layers = {}
+        lora_metadata = {}
+
+        if transformer_lora_layers:
+            lora_layers[cls.transformer_name] = transformer_lora_layers
+            lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata
+
+        if not lora_layers:
+            raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.")
+
+        cls._save_lora_weights(
+            save_directory=save_directory,
+            lora_layers=lora_layers,
+            lora_metadata=lora_metadata,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora
+    def fuse_lora(
+        self,
+        components: List[str] = ["transformer"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details.
+        """
+        super().fuse_lora(
+            components=components,
+            lora_scale=lora_scale,
+            safe_fusing=safe_fusing,
+            adapter_names=adapter_names,
+            **kwargs,
+        )
+
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora
+    def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs):
+        r"""
+        See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details.
+        """
+        super().unfuse_lora(components=components, **kwargs)
+
+
 class LoraLoaderMixin(StableDiffusionLoraLoaderMixin):
     def __init__(self, *args, **kwargs):
         deprecation_message = "LoraLoaderMixin is deprecated and this will be removed in a future version. Please use `StableDiffusionLoraLoaderMixin`, instead."
diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py
index 7d65b30659fb..3f8519bbfa32 100644
--- a/src/diffusers/loaders/peft.py
+++ b/src/diffusers/loaders/peft.py
@@ -22,6 +22,7 @@
 import safetensors
 import torch
 
+from ..hooks.group_offloading import _maybe_remove_and_reapply_group_offloading
 from ..utils import (
     MIN_PEFT_VERSION,
     USE_PEFT_BACKEND,
@@ -62,6 +63,8 @@
     "WanVACETransformer3DModel": lambda model_cls, weights: weights,
     "ChromaTransformer2DModel": lambda model_cls, weights: weights,
     "QwenImageTransformer2DModel": lambda model_cls, weights: weights,
+    "Flux2Transformer2DModel": lambda model_cls, weights: weights,
+    "ZImageTransformer2DModel": lambda model_cls, weights: weights,
 }
 
 
@@ -792,6 +795,8 @@ def delete_adapters(self, adapter_names: Union[List[str], str]):
             if hasattr(self, "peft_config"):
                 self.peft_config.pop(adapter_name, None)
 
+        _maybe_remove_and_reapply_group_offloading(self)
+
     def enable_lora_hotswap(
         self, target_rank: int = 128, check_compiled: Literal["error", "warn", "ignore"] = "error"
     ) -> None:
diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py
index b53647d47630..803fdfc2d952 100644
--- a/src/diffusers/loaders/single_file_model.py
+++ b/src/diffusers/loaders/single_file_model.py
@@ -34,6 +34,7 @@
     convert_chroma_transformer_checkpoint_to_diffusers,
     convert_controlnet_checkpoint,
     convert_cosmos_transformer_checkpoint_to_diffusers,
+    convert_flux2_transformer_checkpoint_to_diffusers,
     convert_flux_transformer_checkpoint_to_diffusers,
     convert_hidream_transformer_to_diffusers,
     convert_hunyuan_video_transformer_to_diffusers,
@@ -48,6 +49,7 @@
     convert_stable_cascade_unet_single_file_to_diffusers,
     convert_wan_transformer_to_diffusers,
     convert_wan_vae_to_diffusers,
+    convert_z_image_transformer_checkpoint_to_diffusers,
     create_controlnet_diffusers_config_from_ldm,
     create_unet_diffusers_config_from_ldm,
     create_vae_diffusers_config_from_ldm,
@@ -162,6 +164,14 @@
         "checkpoint_mapping_fn": lambda x: x,
         "default_subfolder": "transformer",
     },
+    "Flux2Transformer2DModel": {
+        "checkpoint_mapping_fn": convert_flux2_transformer_checkpoint_to_diffusers,
+        "default_subfolder": "transformer",
+    },
+    "ZImageTransformer2DModel": {
+        "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
+        "default_subfolder": "transformer",
+    },
 }
 
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index ef6c41e3ce97..b866a5a21ae3 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -120,6 +120,7 @@
     "hunyuan-video": "txt_in.individual_token_refiner.blocks.0.adaLN_modulation.1.bias",
     "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight",
     "lumina2": ["model.diffusion_model.cap_embedder.0.weight", "cap_embedder.0.weight"],
+    "z-image-turbo": "cap_embedder.0.weight",
     "sana": [
         "blocks.0.cross_attn.q_linear.weight",
         "blocks.0.cross_attn.q_linear.bias",
@@ -140,6 +141,7 @@
         "net.blocks.0.self_attn.q_proj.weight",
         "net.pos_embedder.dim_spatial_range",
     ],
+    "flux2": ["model.diffusion_model.single_stream_modulation.lin.weight", "single_stream_modulation.lin.weight"],
 }
 
 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -189,6 +191,7 @@
     "flux-fill": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Fill-dev"},
     "flux-depth": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-Depth-dev"},
     "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
+    "flux-2-dev": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.2-dev"},
     "ltx-video": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.0"},
     "ltx-video-0.9.1": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.1"},
     "ltx-video-0.9.5": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.5"},
@@ -216,6 +219,7 @@
     "cosmos-2.0-t2i-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Text2Image"},
     "cosmos-2.0-v2w-2B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-2B-Video2World"},
     "cosmos-2.0-v2w-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Video2World"},
+    "z-image-turbo": {"pretrained_model_name_or_path": "Tongyi-MAI/Z-Image-Turbo"},
 }
 
 # Use to configure model sample size when original config is provided
@@ -387,6 +391,14 @@ def is_valid_url(url):
     return False
 
 
+def _is_single_file_path_or_url(pretrained_model_name_or_path):
+    if not os.path.isfile(pretrained_model_name_or_path) or not is_valid_url(pretrained_model_name_or_path):
+        return False
+
+    repo_id, weight_name = _extract_repo_id_and_weights_name(pretrained_model_name_or_path)
+    return bool(repo_id and weight_name)
+
+
 def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
     if not is_valid_url(pretrained_model_name_or_path):
         raise ValueError("Invalid `pretrained_model_name_or_path` provided. Please set it to a valid URL.")
@@ -398,7 +410,6 @@ def _extract_repo_id_and_weights_name(pretrained_model_name_or_path):
         pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "")
     match = re.match(pattern, pretrained_model_name_or_path)
     if not match:
-        logger.warning("Unable to identify the repo_id and weights_name from the provided URL.")
         return repo_id, weights_name
 
     repo_id = f"{match.group(1)}/{match.group(2)}"
@@ -649,6 +660,9 @@ def infer_diffusers_model_type(checkpoint):
         else:
             model_type = "animatediff_v3"
 
+    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["flux2"]):
+        model_type = "flux-2-dev"
+
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["flux"]):
         if any(
             g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"]
@@ -709,6 +723,12 @@ def infer_diffusers_model_type(checkpoint):
     ):
         model_type = "instruct-pix2pix"
 
+    elif (
+        CHECKPOINT_KEY_NAMES["z-image-turbo"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["z-image-turbo"]].shape[0] == 2560
+    ):
+        model_type = "z-image-turbo"
+
     elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["lumina2"]):
         model_type = "lumina2"
 
@@ -3647,3 +3667,221 @@ def rename_transformer_blocks_(key: str, state_dict):
             handler_fn_inplace(key, converted_state_dict)
 
     return converted_state_dict
+
+
+def convert_flux2_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
+    FLUX2_TRANSFORMER_KEYS_RENAME_DICT = {
+        # Image and text input projections
+        "img_in": "x_embedder",
+        "txt_in": "context_embedder",
+        # Timestep and guidance embeddings
+        "time_in.in_layer": "time_guidance_embed.timestep_embedder.linear_1",
+        "time_in.out_layer": "time_guidance_embed.timestep_embedder.linear_2",
+        "guidance_in.in_layer": "time_guidance_embed.guidance_embedder.linear_1",
+        "guidance_in.out_layer": "time_guidance_embed.guidance_embedder.linear_2",
+        # Modulation parameters
+        "double_stream_modulation_img.lin": "double_stream_modulation_img.linear",
+        "double_stream_modulation_txt.lin": "double_stream_modulation_txt.linear",
+        "single_stream_modulation.lin": "single_stream_modulation.linear",
+        # Final output layer
+        # "final_layer.adaLN_modulation.1": "norm_out.linear",  # Handle separately since we need to swap mod params
+        "final_layer.linear": "proj_out",
+    }
+
+    FLUX2_TRANSFORMER_ADA_LAYER_NORM_KEY_MAP = {
+        "final_layer.adaLN_modulation.1": "norm_out.linear",
+    }
+
+    FLUX2_TRANSFORMER_DOUBLE_BLOCK_KEY_MAP = {
+        # Handle fused QKV projections separately as we need to break into Q, K, V projections
+        "img_attn.norm.query_norm": "attn.norm_q",
+        "img_attn.norm.key_norm": "attn.norm_k",
+        "img_attn.proj": "attn.to_out.0",
+        "img_mlp.0": "ff.linear_in",
+        "img_mlp.2": "ff.linear_out",
+        "txt_attn.norm.query_norm": "attn.norm_added_q",
+        "txt_attn.norm.key_norm": "attn.norm_added_k",
+        "txt_attn.proj": "attn.to_add_out",
+        "txt_mlp.0": "ff_context.linear_in",
+        "txt_mlp.2": "ff_context.linear_out",
+    }
+
+    FLUX2_TRANSFORMER_SINGLE_BLOCK_KEY_MAP = {
+        "linear1": "attn.to_qkv_mlp_proj",
+        "linear2": "attn.to_out",
+        "norm.query_norm": "attn.norm_q",
+        "norm.key_norm": "attn.norm_k",
+    }
+
+    def convert_flux2_single_stream_blocks(key: str, state_dict: dict[str, object]) -> None:
+        # Skip if not a weight, bias, or scale
+        if ".weight" not in key and ".bias" not in key and ".scale" not in key:
+            return
+
+        # Mapping:
+        #     - single_blocks.{N}.linear1               --> single_transformer_blocks.{N}.attn.to_qkv_mlp_proj
+        #     - single_blocks.{N}.linear2               --> single_transformer_blocks.{N}.attn.to_out
+        #     - single_blocks.{N}.norm.query_norm.scale --> single_transformer_blocks.{N}.attn.norm_q.weight
+        #     - single_blocks.{N}.norm.key_norm.scale   --> single_transformer_blocks.{N}.attn.norm_k.weight
+        new_prefix = "single_transformer_blocks"
+        if "single_blocks." in key:
+            parts = key.split(".")
+            block_idx = parts[1]
+            within_block_name = ".".join(parts[2:-1])
+            param_type = parts[-1]
+
+            if param_type == "scale":
+                param_type = "weight"
+
+            new_within_block_name = FLUX2_TRANSFORMER_SINGLE_BLOCK_KEY_MAP[within_block_name]
+            new_key = ".".join([new_prefix, block_idx, new_within_block_name, param_type])
+
+            param = state_dict.pop(key)
+            state_dict[new_key] = param
+
+        return
+
+    def convert_ada_layer_norm_weights(key: str, state_dict: dict[str, object]) -> None:
+        # Skip if not a weight
+        if ".weight" not in key:
+            return
+
+        # If adaLN_modulation is in the key, swap scale and shift parameters
+        # Original implementation is (shift, scale); diffusers implementation is (scale, shift)
+        if "adaLN_modulation" in key:
+            key_without_param_type, param_type = key.rsplit(".", maxsplit=1)
+            # Assume all such keys are in the AdaLayerNorm key map
+            new_key_without_param_type = FLUX2_TRANSFORMER_ADA_LAYER_NORM_KEY_MAP[key_without_param_type]
+            new_key = ".".join([new_key_without_param_type, param_type])
+
+            swapped_weight = swap_scale_shift(state_dict.pop(key), 0)
+            state_dict[new_key] = swapped_weight
+
+        return
+
+    def convert_flux2_double_stream_blocks(key: str, state_dict: dict[str, object]) -> None:
+        # Skip if not a weight, bias, or scale
+        if ".weight" not in key and ".bias" not in key and ".scale" not in key:
+            return
+
+        new_prefix = "transformer_blocks"
+        if "double_blocks." in key:
+            parts = key.split(".")
+            block_idx = parts[1]
+            modality_block_name = parts[2]  # img_attn, img_mlp, txt_attn, txt_mlp
+            within_block_name = ".".join(parts[2:-1])
+            param_type = parts[-1]
+
+            if param_type == "scale":
+                param_type = "weight"
+
+            if "qkv" in within_block_name:
+                fused_qkv_weight = state_dict.pop(key)
+                to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+                if "img" in modality_block_name:
+                    # double_blocks.{N}.img_attn.qkv --> transformer_blocks.{N}.attn.{to_q|to_k|to_v}
+                    to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+                    new_q_name = "attn.to_q"
+                    new_k_name = "attn.to_k"
+                    new_v_name = "attn.to_v"
+                elif "txt" in modality_block_name:
+                    # double_blocks.{N}.txt_attn.qkv --> transformer_blocks.{N}.attn.{add_q_proj|add_k_proj|add_v_proj}
+                    to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+                    new_q_name = "attn.add_q_proj"
+                    new_k_name = "attn.add_k_proj"
+                    new_v_name = "attn.add_v_proj"
+                new_q_key = ".".join([new_prefix, block_idx, new_q_name, param_type])
+                new_k_key = ".".join([new_prefix, block_idx, new_k_name, param_type])
+                new_v_key = ".".join([new_prefix, block_idx, new_v_name, param_type])
+                state_dict[new_q_key] = to_q_weight
+                state_dict[new_k_key] = to_k_weight
+                state_dict[new_v_key] = to_v_weight
+            else:
+                new_within_block_name = FLUX2_TRANSFORMER_DOUBLE_BLOCK_KEY_MAP[within_block_name]
+                new_key = ".".join([new_prefix, block_idx, new_within_block_name, param_type])
+
+                param = state_dict.pop(key)
+                state_dict[new_key] = param
+        return
+
+    def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str) -> None:
+        state_dict[new_key] = state_dict.pop(old_key)
+
+    TRANSFORMER_SPECIAL_KEYS_REMAP = {
+        "adaLN_modulation": convert_ada_layer_norm_weights,
+        "double_blocks": convert_flux2_double_stream_blocks,
+        "single_blocks": convert_flux2_single_stream_blocks,
+    }
+
+    converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}
+
+    # Handle official code --> diffusers key remapping via the remap dict
+    for key in list(converted_state_dict.keys()):
+        new_key = key[:]
+        for replace_key, rename_key in FLUX2_TRANSFORMER_KEYS_RENAME_DICT.items():
+            new_key = new_key.replace(replace_key, rename_key)
+
+        update_state_dict(converted_state_dict, key, new_key)
+
+    # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
+    # special_keys_remap
+    for key in list(converted_state_dict.keys()):
+        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
+            if special_key not in key:
+                continue
+            handler_fn_inplace(key, converted_state_dict)
+
+    return converted_state_dict
+
+
+def convert_z_image_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
+    Z_IMAGE_KEYS_RENAME_DICT = {
+        "final_layer.": "all_final_layer.2-1.",
+        "x_embedder.": "all_x_embedder.2-1.",
+        ".attention.out.bias": ".attention.to_out.0.bias",
+        ".attention.k_norm.weight": ".attention.norm_k.weight",
+        ".attention.q_norm.weight": ".attention.norm_q.weight",
+        ".attention.out.weight": ".attention.to_out.0.weight",
+    }
+
+    def convert_z_image_fused_attention(key: str, state_dict: dict[str, object]) -> None:
+        if ".attention.qkv.weight" not in key:
+            return
+
+        fused_qkv_weight = state_dict.pop(key)
+        to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
+        new_q_name = key.replace(".attention.qkv.weight", ".attention.to_q.weight")
+        new_k_name = key.replace(".attention.qkv.weight", ".attention.to_k.weight")
+        new_v_name = key.replace(".attention.qkv.weight", ".attention.to_v.weight")
+
+        state_dict[new_q_name] = to_q_weight
+        state_dict[new_k_name] = to_k_weight
+        state_dict[new_v_name] = to_v_weight
+        return
+
+    TRANSFORMER_SPECIAL_KEYS_REMAP = {
+        ".attention.qkv.weight": convert_z_image_fused_attention,
+    }
+
+    def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str) -> None:
+        state_dict[new_key] = state_dict.pop(old_key)
+
+    converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}
+
+    # Handle single file --> diffusers key remapping via the remap dict
+    for key in list(converted_state_dict.keys()):
+        new_key = key[:]
+        for replace_key, rename_key in Z_IMAGE_KEYS_RENAME_DICT.items():
+            new_key = new_key.replace(replace_key, rename_key)
+
+        update_state_dict(converted_state_dict, key, new_key)
+
+    # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
+    # special_keys_remap
+    for key in list(converted_state_dict.keys()):
+        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
+            if special_key not in key:
+                continue
+            handler_fn_inplace(key, converted_state_dict)
+
+    return converted_state_dict
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index e3b297464143..29d8b0b5a55d 100755
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -35,9 +35,11 @@
     _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"]
     _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
     _import_structure["autoencoders.autoencoder_kl_cosmos"] = ["AutoencoderKLCosmos"]
+    _import_structure["autoencoders.autoencoder_kl_flux2"] = ["AutoencoderKLFlux2"]
     _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"]
     _import_structure["autoencoders.autoencoder_kl_hunyuanimage"] = ["AutoencoderKLHunyuanImage"]
     _import_structure["autoencoders.autoencoder_kl_hunyuanimage_refiner"] = ["AutoencoderKLHunyuanImageRefiner"]
+    _import_structure["autoencoders.autoencoder_kl_hunyuanvideo15"] = ["AutoencoderKLHunyuanVideo15"]
     _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
     _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]
     _import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"]
@@ -86,13 +88,16 @@
     _import_structure["transformers.transformer_bria"] = ["BriaTransformer2DModel"]
     _import_structure["transformers.transformer_bria_fibo"] = ["BriaFiboTransformer2DModel"]
     _import_structure["transformers.transformer_chroma"] = ["ChromaTransformer2DModel"]
+    _import_structure["transformers.transformer_chronoedit"] = ["ChronoEditTransformer3DModel"]
     _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
     _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"]
     _import_structure["transformers.transformer_cosmos"] = ["CosmosTransformer3DModel"]
     _import_structure["transformers.transformer_easyanimate"] = ["EasyAnimateTransformer3DModel"]
     _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
+    _import_structure["transformers.transformer_flux2"] = ["Flux2Transformer2DModel"]
     _import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
     _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
+    _import_structure["transformers.transformer_hunyuan_video15"] = ["HunyuanVideo15Transformer3DModel"]
     _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
     _import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"]
     _import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"]
@@ -100,13 +105,17 @@
     _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
     _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
     _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"]
+    _import_structure["transformers.transformer_ovis_image"] = ["OvisImageTransformer2DModel"]
     _import_structure["transformers.transformer_prx"] = ["PRXTransformer2DModel"]
     _import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"]
+    _import_structure["transformers.transformer_sana_video"] = ["SanaVideoTransformer3DModel"]
     _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
     _import_structure["transformers.transformer_skyreels_v2"] = ["SkyReelsV2Transformer3DModel"]
     _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
     _import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
+    _import_structure["transformers.transformer_wan_animate"] = ["WanAnimateTransformer3DModel"]
     _import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
+    _import_structure["transformers.transformer_z_image"] = ["ZImageTransformer2DModel"]
     _import_structure["unets.unet_1d"] = ["UNet1DModel"]
     _import_structure["unets.unet_2d"] = ["UNet2DModel"]
     _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
@@ -137,9 +146,11 @@
             AutoencoderKLAllegro,
             AutoencoderKLCogVideoX,
             AutoencoderKLCosmos,
+            AutoencoderKLFlux2,
             AutoencoderKLHunyuanImage,
             AutoencoderKLHunyuanImageRefiner,
             AutoencoderKLHunyuanVideo,
+            AutoencoderKLHunyuanVideo15,
             AutoencoderKLLTXVideo,
             AutoencoderKLMagvit,
             AutoencoderKLMochi,
@@ -178,6 +189,7 @@
             BriaFiboTransformer2DModel,
             BriaTransformer2DModel,
             ChromaTransformer2DModel,
+            ChronoEditTransformer3DModel,
             CogVideoXTransformer3DModel,
             CogView3PlusTransformer2DModel,
             CogView4Transformer2DModel,
@@ -186,10 +198,12 @@
             DiTTransformer2DModel,
             DualTransformer2DModel,
             EasyAnimateTransformer3DModel,
+            Flux2Transformer2DModel,
             FluxTransformer2DModel,
             HiDreamImageTransformer2DModel,
             HunyuanDiT2DModel,
             HunyuanImageTransformer2DModel,
+            HunyuanVideo15Transformer3DModel,
             HunyuanVideoFramepackTransformer3DModel,
             HunyuanVideoTransformer3DModel,
             Kandinsky5Transformer3DModel,
@@ -199,19 +213,23 @@
             LuminaNextDiT2DModel,
             MochiTransformer3DModel,
             OmniGenTransformer2DModel,
+            OvisImageTransformer2DModel,
             PixArtTransformer2DModel,
             PriorTransformer,
             PRXTransformer2DModel,
             QwenImageTransformer2DModel,
             SanaTransformer2DModel,
+            SanaVideoTransformer3DModel,
             SD3Transformer2DModel,
             SkyReelsV2Transformer3DModel,
             StableAudioDiTModel,
             T5FilmDecoder,
             Transformer2DModel,
             TransformerTemporalModel,
+            WanAnimateTransformer3DModel,
             WanTransformer3DModel,
             WanVACETransformer3DModel,
+            ZImageTransformer2DModel,
         )
         from .unets import (
             I2VGenXLUNet,
diff --git a/src/diffusers/models/_modeling_parallel.py b/src/diffusers/models/_modeling_parallel.py
index 2a1d2cc6ceea..2a4eb520c796 100644
--- a/src/diffusers/models/_modeling_parallel.py
+++ b/src/diffusers/models/_modeling_parallel.py
@@ -44,11 +44,16 @@ class ContextParallelConfig:
 
     Args:
         ring_degree (`int`, *optional*, defaults to `1`):
-            Number of devices to use for ring attention within a context parallel region. Must be a divisor of the
-            total number of devices in the context parallel mesh.
+            Number of devices to use for Ring Attention. Sequence is split across devices. Each device computes
+            attention between its local Q and KV chunks passed sequentially around ring. Lower memory (only holds 1/N
+            of KV at a time), overlaps compute with communication, but requires N iterations to see all tokens. Best
+            for long sequences with limited memory/bandwidth. Number of devices to use for ring attention within a
+            context parallel region. Must be a divisor of the total number of devices in the context parallel mesh.
         ulysses_degree (`int`, *optional*, defaults to `1`):
-            Number of devices to use for ulysses attention within a context parallel region. Must be a divisor of the
-            total number of devices in the context parallel mesh.
+            Number of devices to use for Ulysses Attention. Sequence split is across devices. Each device computes
+            local QKV, then all-gathers all KV chunks to compute full attention in one pass. Higher memory (stores all
+            KV), requires high-bandwidth all-to-all communication, but lower latency. Best for moderate sequences with
+            good interconnect bandwidth.
         convert_to_fp32 (`bool`, *optional*, defaults to `True`):
             Whether to convert output and LSE to float32 for ring attention numerical stability.
         rotate_method (`str`, *optional*, defaults to `"allgather"`):
@@ -79,29 +84,46 @@ def __post_init__(self):
         if self.ulysses_degree is None:
             self.ulysses_degree = 1
 
+        if self.ring_degree == 1 and self.ulysses_degree == 1:
+            raise ValueError(
+                "Either ring_degree or ulysses_degree must be greater than 1 in order to use context parallel inference"
+            )
+        if self.ring_degree < 1 or self.ulysses_degree < 1:
+            raise ValueError("`ring_degree` and `ulysses_degree` must be greater than or equal to 1.")
+        if self.ring_degree > 1 and self.ulysses_degree > 1:
+            raise ValueError(
+                "Unified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1."
+            )
+        if self.rotate_method != "allgather":
+            raise NotImplementedError(
+                f"Only rotate_method='allgather' is supported for now, but got {self.rotate_method}."
+            )
+
+    @property
+    def mesh_shape(self) -> Tuple[int, int]:
+        return (self.ring_degree, self.ulysses_degree)
+
+    @property
+    def mesh_dim_names(self) -> Tuple[str, str]:
+        """Dimension names for the device mesh."""
+        return ("ring", "ulysses")
+
     def setup(self, rank: int, world_size: int, device: torch.device, mesh: torch.distributed.device_mesh.DeviceMesh):
         self._rank = rank
         self._world_size = world_size
         self._device = device
         self._mesh = mesh
-        if self.ring_degree is None:
-            self.ring_degree = 1
-        if self.ulysses_degree is None:
-            self.ulysses_degree = 1
-        if self.rotate_method != "allgather":
-            raise NotImplementedError(
-                f"Only rotate_method='allgather' is supported for now, but got {self.rotate_method}."
+
+        if self.ulysses_degree * self.ring_degree > world_size:
+            raise ValueError(
+                f"The product of `ring_degree` ({self.ring_degree}) and `ulysses_degree` ({self.ulysses_degree}) must not exceed the world size ({world_size})."
             )
-        if self._flattened_mesh is None:
-            self._flattened_mesh = self._mesh._flatten()
-        if self._ring_mesh is None:
-            self._ring_mesh = self._mesh["ring"]
-        if self._ulysses_mesh is None:
-            self._ulysses_mesh = self._mesh["ulysses"]
-        if self._ring_local_rank is None:
-            self._ring_local_rank = self._ring_mesh.get_local_rank()
-        if self._ulysses_local_rank is None:
-            self._ulysses_local_rank = self._ulysses_mesh.get_local_rank()
+
+        self._flattened_mesh = self._mesh._flatten()
+        self._ring_mesh = self._mesh["ring"]
+        self._ulysses_mesh = self._mesh["ulysses"]
+        self._ring_local_rank = self._ring_mesh.get_local_rank()
+        self._ulysses_local_rank = self._ulysses_mesh.get_local_rank()
 
 
 @dataclass
@@ -119,7 +141,7 @@ class ParallelConfig:
     _rank: int = None
     _world_size: int = None
     _device: torch.device = None
-    _cp_mesh: torch.distributed.device_mesh.DeviceMesh = None
+    _mesh: torch.distributed.device_mesh.DeviceMesh = None
 
     def setup(
         self,
@@ -127,14 +149,14 @@ def setup(
         world_size: int,
         device: torch.device,
         *,
-        cp_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
+        mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
     ):
         self._rank = rank
         self._world_size = world_size
         self._device = device
-        self._cp_mesh = cp_mesh
+        self._mesh = mesh
         if self.context_parallel_config is not None:
-            self.context_parallel_config.setup(rank, world_size, device, cp_mesh)
+            self.context_parallel_config.setup(rank, world_size, device, mesh)
 
 
 @dataclass(frozen=True)
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 5164cf311d3c..8b583d1a1cce 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -105,7 +105,7 @@ def fuse_qkv_projections(self):
                 raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
 
         for module in self.modules():
-            if isinstance(module, AttentionModuleMixin):
+            if isinstance(module, AttentionModuleMixin) and module._supports_qkv_fusion:
                 module.fuse_projections()
 
     def unfuse_qkv_projections(self):
@@ -114,13 +114,14 @@ def unfuse_qkv_projections(self):
         > [!WARNING] > This API is 🧪 experimental.
         """
         for module in self.modules():
-            if isinstance(module, AttentionModuleMixin):
+            if isinstance(module, AttentionModuleMixin) and module._supports_qkv_fusion:
                 module.unfuse_projections()
 
 
 class AttentionModuleMixin:
     _default_processor_cls = None
     _available_processors = []
+    _supports_qkv_fusion = True
     fused_projections = False
 
     def set_processor(self, processor: AttentionProcessor) -> None:
@@ -248,6 +249,14 @@ def fuse_projections(self):
         """
         Fuse the query, key, and value projections into a single projection for efficiency.
         """
+        # Skip if the AttentionModuleMixin subclass does not support fusion (for example, the QKV projections in Flux2
+        # single stream blocks are always fused)
+        if not self._supports_qkv_fusion:
+            logger.debug(
+                f"{self.__class__.__name__} does not support fusing QKV projections, so `fuse_projections` will no-op."
+            )
+            return
+
         # Skip if already fused
         if getattr(self, "fused_projections", False):
             return
@@ -307,6 +316,11 @@ def unfuse_projections(self):
         """
         Unfuse the query, key, and value projections back to separate projections.
         """
+        # Skip if the AttentionModuleMixin subclass does not support fusion (for example, the QKV projections in Flux2
+        # single stream blocks are always fused)
+        if not self._supports_qkv_fusion:
+            return
+
         # Skip if not fused
         if not getattr(self, "fused_projections", False):
             return
diff --git a/src/diffusers/models/attention_dispatch.py b/src/diffusers/models/attention_dispatch.py
index ab0d7102ee83..ffad94cc7f27 100644
--- a/src/diffusers/models/attention_dispatch.py
+++ b/src/diffusers/models/attention_dispatch.py
@@ -16,8 +16,9 @@
 import functools
 import inspect
 import math
+from dataclasses import dataclass
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -42,7 +43,7 @@
     is_xformers_available,
     is_xformers_version,
 )
-from ..utils.constants import DIFFUSERS_ATTN_BACKEND, DIFFUSERS_ATTN_CHECKS, DIFFUSERS_ENABLE_HUB_KERNELS
+from ..utils.constants import DIFFUSERS_ATTN_BACKEND, DIFFUSERS_ATTN_CHECKS
 
 
 if TYPE_CHECKING:
@@ -82,24 +83,11 @@
     flash_attn_3_func = None
     flash_attn_3_varlen_func = None
 
-
 if _CAN_USE_AITER_ATTN:
     from aiter import flash_attn_func as aiter_flash_attn_func
 else:
     aiter_flash_attn_func = None
 
-if DIFFUSERS_ENABLE_HUB_KERNELS:
-    if not is_kernels_available():
-        raise ImportError(
-            "To use FA3 kernel for your hardware from the Hub, the `kernels` library must be installed. Install with `pip install kernels`."
-        )
-    from ..utils.kernels_utils import _get_fa3_from_hub
-
-    flash_attn_interface_hub = _get_fa3_from_hub()
-    flash_attn_3_func_hub = flash_attn_interface_hub.flash_attn_func
-else:
-    flash_attn_3_func_hub = None
-
 if _CAN_USE_SAGE_ATTN:
     from sageattention import (
         sageattn,
@@ -172,21 +160,19 @@ def wrap(func):
 # - CP with sage attention, flex, xformers, other missing backends
 # - Add support for normal and CP training with backends that don't support it yet
 
-_SAGE_ATTENTION_PV_ACCUM_DTYPE = Literal["fp32", "fp32+fp32"]
-_SAGE_ATTENTION_QK_QUANT_GRAN = Literal["per_thread", "per_warp"]
-_SAGE_ATTENTION_QUANTIZATION_BACKEND = Literal["cuda", "triton"]
-
 
 class AttentionBackendName(str, Enum):
     # EAGER = "eager"
 
     # `flash-attn`
     FLASH = "flash"
+    FLASH_HUB = "flash_hub"
     FLASH_VARLEN = "flash_varlen"
+    FLASH_VARLEN_HUB = "flash_varlen_hub"
     _FLASH_3 = "_flash_3"
     _FLASH_VARLEN_3 = "_flash_varlen_3"
     _FLASH_3_HUB = "_flash_3_hub"
-    # _FLASH_VARLEN_3_HUB = "_flash_varlen_3_hub"  # not supported yet.
+    _FLASH_3_VARLEN_HUB = "_flash_3_varlen_hub"
 
     # `aiter`
     AITER = "aiter"
@@ -203,6 +189,7 @@ class AttentionBackendName(str, Enum):
 
     # `sageattention`
     SAGE = "sage"
+    SAGE_HUB = "sage_hub"
     SAGE_VARLEN = "sage_varlen"
     _SAGE_QK_INT8_PV_FP8_CUDA = "_sage_qk_int8_pv_fp8_cuda"
     _SAGE_QK_INT8_PV_FP8_CUDA_SM90 = "_sage_qk_int8_pv_fp8_cuda_sm90"
@@ -220,7 +207,7 @@ class _AttentionBackendRegistry:
     _backends = {}
     _constraints = {}
     _supported_arg_names = {}
-    _supports_context_parallel = {}
+    _supports_context_parallel = set()
     _active_backend = AttentionBackendName(DIFFUSERS_ATTN_BACKEND)
     _checks_enabled = DIFFUSERS_ATTN_CHECKS
 
@@ -237,7 +224,9 @@ def decorator(func):
             cls._backends[backend] = func
             cls._constraints[backend] = constraints or []
             cls._supported_arg_names[backend] = set(inspect.signature(func).parameters.keys())
-            cls._supports_context_parallel[backend] = supports_context_parallel
+            if supports_context_parallel:
+                cls._supports_context_parallel.add(backend.value)
+
             return func
 
         return decorator
@@ -251,15 +240,45 @@ def list_backends(cls):
         return list(cls._backends.keys())
 
     @classmethod
-    def _is_context_parallel_enabled(
-        cls, backend: AttentionBackendName, parallel_config: Optional["ParallelConfig"]
+    def _is_context_parallel_available(
+        cls,
+        backend: AttentionBackendName,
     ) -> bool:
-        supports_context_parallel = backend in cls._supports_context_parallel
-        is_degree_greater_than_1 = parallel_config is not None and (
-            parallel_config.context_parallel_config.ring_degree > 1
-            or parallel_config.context_parallel_config.ulysses_degree > 1
-        )
-        return supports_context_parallel and is_degree_greater_than_1
+        supports_context_parallel = backend.value in cls._supports_context_parallel
+        return supports_context_parallel
+
+
+@dataclass
+class _HubKernelConfig:
+    """Configuration for downloading and using a hub-based attention kernel."""
+
+    repo_id: str
+    function_attr: str
+    revision: Optional[str] = None
+    kernel_fn: Optional[Callable] = None
+
+
+# Registry for hub-based attention kernels
+_HUB_KERNELS_REGISTRY: Dict["AttentionBackendName", _HubKernelConfig] = {
+    # TODO: temporary revision for now. Remove when merged upstream into `main`.
+    AttentionBackendName._FLASH_3_HUB: _HubKernelConfig(
+        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", revision="fake-ops-return-probs"
+    ),
+    AttentionBackendName._FLASH_3_VARLEN_HUB: _HubKernelConfig(
+        repo_id="kernels-community/flash-attn3",
+        function_attr="flash_attn_varlen_func",
+        # revision="fake-ops-return-probs",
+    ),
+    AttentionBackendName.FLASH_HUB: _HubKernelConfig(
+        repo_id="kernels-community/flash-attn2", function_attr="flash_attn_func", revision=None
+    ),
+    AttentionBackendName.FLASH_VARLEN_HUB: _HubKernelConfig(
+        repo_id="kernels-community/flash-attn2", function_attr="flash_attn_varlen_func", revision=None
+    ),
+    AttentionBackendName.SAGE_HUB: _HubKernelConfig(
+        repo_id="kernels-community/sage_attention", function_attr="sageattn", revision=None
+    ),
+}
 
 
 @contextlib.contextmanager
@@ -272,6 +291,7 @@ def attention_backend(backend: Union[str, AttentionBackendName] = AttentionBacke
 
     backend = AttentionBackendName(backend)
     _check_attention_backend_requirements(backend)
+    _maybe_download_kernel_for_backend(backend)
 
     old_backend = _AttentionBackendRegistry._active_backend
     _AttentionBackendRegistry._active_backend = backend
@@ -306,14 +326,6 @@ def dispatch_attention_fn(
         backend_name = AttentionBackendName(backend)
         backend_fn = _AttentionBackendRegistry._backends.get(backend_name)
 
-    if parallel_config is not None and not _AttentionBackendRegistry._is_context_parallel_enabled(
-        backend_name, parallel_config
-    ):
-        raise ValueError(
-            f"Backend {backend_name} either does not support context parallelism or context parallelism "
-            f"was enabled with a world size of 1."
-        )
-
     kwargs = {
         "query": query,
         "key": key,
@@ -392,12 +404,18 @@ def _check_shape(
     attn_mask: Optional[torch.Tensor] = None,
     **kwargs,
 ) -> None:
+    # Expected shapes:
+    # query: (batch_size, seq_len_q, num_heads, head_dim)
+    # key:   (batch_size, seq_len_kv, num_heads, head_dim)
+    # value: (batch_size, seq_len_kv, num_heads, head_dim)
+    # attn_mask: (seq_len_q, seq_len_kv) or (batch_size, seq_len_q, seq_len_kv)
+    #            or (batch_size, num_heads, seq_len_q, seq_len_kv)
     if query.shape[-1] != key.shape[-1]:
-        raise ValueError("Query and key must have the same last dimension.")
-    if query.shape[-2] != value.shape[-2]:
-        raise ValueError("Query and value must have the same second to last dimension.")
-    if attn_mask is not None and attn_mask.shape[-1] != key.shape[-2]:
-        raise ValueError("Attention mask must match the key's second to last dimension.")
+        raise ValueError("Query and key must have the same head dimension.")
+    if key.shape[-3] != value.shape[-3]:
+        raise ValueError("Key and value must have the same sequence length.")
+    if attn_mask is not None and attn_mask.shape[-1] != key.shape[-3]:
+        raise ValueError("Attention mask must match the key's sequence length.")
 
 
 # ===== Helper functions =====
@@ -416,15 +434,16 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
                 f"Flash Attention 3 backend '{backend.value}' is not usable because of missing package or the version is too old. Please build FA3 beta release from source."
             )
 
-    # TODO: add support Hub variant of FA3 varlen later
-    elif backend in [AttentionBackendName._FLASH_3_HUB]:
-        if not DIFFUSERS_ENABLE_HUB_KERNELS:
-            raise RuntimeError(
-                f"Flash Attention 3 Hub backend '{backend.value}' is not usable because the `DIFFUSERS_ENABLE_HUB_KERNELS` env var isn't set. Please set it like `export DIFFUSERS_ENABLE_HUB_KERNELS=yes`."
-            )
+    elif backend in [
+        AttentionBackendName.FLASH_HUB,
+        AttentionBackendName.FLASH_VARLEN_HUB,
+        AttentionBackendName._FLASH_3_HUB,
+        AttentionBackendName._FLASH_3_VARLEN_HUB,
+        AttentionBackendName.SAGE_HUB,
+    ]:
         if not is_kernels_available():
             raise RuntimeError(
-                f"Flash Attention 3 Hub backend '{backend.value}' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`."
+                f"Backend '{backend.value}' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`."
             )
 
     elif backend == AttentionBackendName.AITER:
@@ -574,6 +593,29 @@ def _flex_attention_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
     return q_idx >= kv_idx
 
 
+# ===== Helpers for downloading kernels =====
+def _maybe_download_kernel_for_backend(backend: AttentionBackendName) -> None:
+    if backend not in _HUB_KERNELS_REGISTRY:
+        return
+    config = _HUB_KERNELS_REGISTRY[backend]
+
+    if config.kernel_fn is not None:
+        return
+
+    try:
+        from kernels import get_kernel
+
+        kernel_module = get_kernel(config.repo_id, revision=config.revision)
+        kernel_func = getattr(kernel_module, config.function_attr)
+
+        # Cache the downloaded kernel function in the config object
+        config.kernel_fn = kernel_func
+
+    except Exception as e:
+        logger.error(f"An error occurred while fetching kernel '{config.repo_id}' from the Hub: {e}")
+        raise
+
+
 # ===== torch op registrations =====
 # Registrations are required for fullgraph tracing compatibility
 # TODO: this is only required because the beta release FA3 does not have it. There is a PR adding
@@ -649,6 +691,86 @@ def _(
 # ===== Helper functions to use attention backends with templated CP autograd functions =====
 
 
+def _native_attention_forward_op(
+    ctx: torch.autograd.function.FunctionCtx,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+    enable_gqa: bool = False,
+    return_lse: bool = False,
+    _save_ctx: bool = True,
+    _parallel_config: Optional["ParallelConfig"] = None,
+):
+    # Native attention does not return_lse
+    if return_lse:
+        raise ValueError("Native attention does not support return_lse=True")
+
+    # used for backward pass
+    if _save_ctx:
+        ctx.save_for_backward(query, key, value)
+        ctx.attn_mask = attn_mask
+        ctx.dropout_p = dropout_p
+        ctx.is_causal = is_causal
+        ctx.scale = scale
+        ctx.enable_gqa = enable_gqa
+
+    query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
+    out = torch.nn.functional.scaled_dot_product_attention(
+        query=query,
+        key=key,
+        value=value,
+        attn_mask=attn_mask,
+        dropout_p=dropout_p,
+        is_causal=is_causal,
+        scale=scale,
+        enable_gqa=enable_gqa,
+    )
+    out = out.permute(0, 2, 1, 3)
+
+    return out
+
+
+def _native_attention_backward_op(
+    ctx: torch.autograd.function.FunctionCtx,
+    grad_out: torch.Tensor,
+    *args,
+    **kwargs,
+):
+    query, key, value = ctx.saved_tensors
+
+    query.requires_grad_(True)
+    key.requires_grad_(True)
+    value.requires_grad_(True)
+
+    query_t, key_t, value_t = (x.permute(0, 2, 1, 3) for x in (query, key, value))
+    out = torch.nn.functional.scaled_dot_product_attention(
+        query=query_t,
+        key=key_t,
+        value=value_t,
+        attn_mask=ctx.attn_mask,
+        dropout_p=ctx.dropout_p,
+        is_causal=ctx.is_causal,
+        scale=ctx.scale,
+        enable_gqa=ctx.enable_gqa,
+    )
+    out = out.permute(0, 2, 1, 3)
+
+    grad_out_t = grad_out.permute(0, 2, 1, 3)
+    grad_query_t, grad_key_t, grad_value_t = torch.autograd.grad(
+        outputs=out, inputs=[query_t, key_t, value_t], grad_outputs=grad_out_t, retain_graph=False
+    )
+
+    grad_query = grad_query_t.permute(0, 2, 1, 3)
+    grad_key = grad_key_t.permute(0, 2, 1, 3)
+    grad_value = grad_value_t.permute(0, 2, 1, 3)
+
+    return grad_query, grad_key, grad_value
+
+
 # https://github.com/pytorch/pytorch/blob/8904ba638726f8c9a5aff5977c4aa76c9d2edfa6/aten/src/ATen/native/native_functions.yaml#L14958
 # forward declaration:
 #   aten::_scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0., bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
@@ -1247,6 +1369,95 @@ def _flash_attention(
     return (out, lse) if return_lse else out
 
 
+@_AttentionBackendRegistry.register(
+    AttentionBackendName.FLASH_HUB,
+    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
+    supports_context_parallel=False,
+)
+def _flash_attention_hub(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+    return_lse: bool = False,
+    _parallel_config: Optional["ParallelConfig"] = None,
+) -> torch.Tensor:
+    lse = None
+    func = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB].kernel_fn
+    out = func(
+        q=query,
+        k=key,
+        v=value,
+        dropout_p=dropout_p,
+        softmax_scale=scale,
+        causal=is_causal,
+        return_attn_probs=return_lse,
+    )
+    if return_lse:
+        out, lse, *_ = out
+
+    return (out, lse) if return_lse else out
+
+
+@_AttentionBackendRegistry.register(
+    AttentionBackendName.FLASH_VARLEN_HUB,
+    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
+    supports_context_parallel=False,
+)
+def _flash_varlen_attention_hub(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    scale: Optional[float] = None,
+    is_causal: bool = False,
+    return_lse: bool = False,
+    _parallel_config: Optional["ParallelConfig"] = None,
+) -> torch.Tensor:
+    batch_size, seq_len_q, _, _ = query.shape
+    _, seq_len_kv, _, _ = key.shape
+
+    if attn_mask is not None:
+        attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv)
+
+    (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = (
+        _prepare_for_flash_attn_or_sage_varlen(
+            batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device
+        )
+    )
+
+    key_valid, value_valid = [], []
+    for b in range(batch_size):
+        valid_len = seqlens_k[b]
+        key_valid.append(key[b, :valid_len])
+        value_valid.append(value[b, :valid_len])
+
+    query_packed = query.flatten(0, 1)
+    key_packed = torch.cat(key_valid, dim=0)
+    value_packed = torch.cat(value_valid, dim=0)
+
+    func = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_VARLEN_HUB].kernel_fn
+    out = func(
+        q=query_packed,
+        k=key_packed,
+        v=value_packed,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        max_seqlen_q=max_seqlen_q,
+        max_seqlen_k=max_seqlen_k,
+        dropout_p=dropout_p,
+        softmax_scale=scale,
+        causal=is_causal,
+        return_attn_probs=return_lse,
+    )
+    out = out.unflatten(0, (batch_size, -1))
+
+    return out
+
+
 @_AttentionBackendRegistry.register(
     AttentionBackendName.FLASH_VARLEN,
     constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
@@ -1328,6 +1539,7 @@ def _flash_attention_3(
 @_AttentionBackendRegistry.register(
     AttentionBackendName._FLASH_3_HUB,
     constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
+    supports_context_parallel=False,
 )
 def _flash_attention_3_hub(
     query: torch.Tensor,
@@ -1341,7 +1553,11 @@ def _flash_attention_3_hub(
     return_attn_probs: bool = False,
     _parallel_config: Optional["ParallelConfig"] = None,
 ) -> torch.Tensor:
-    out = flash_attn_3_func_hub(
+    if _parallel_config:
+        raise NotImplementedError(f"{AttentionBackendName._FLASH_3_HUB.value} is not implemented for parallelism yet.")
+
+    func = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB].kernel_fn
+    out = func(
         q=query,
         k=key,
         v=value,
@@ -1364,6 +1580,60 @@ def _flash_attention_3_hub(
     return (out[0], out[1]) if return_attn_probs else out
 
 
+@_AttentionBackendRegistry.register(
+    AttentionBackendName._FLASH_3_VARLEN_HUB,
+    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
+    supports_context_parallel=False,
+)
+def _flash_attention_3_varlen_hub(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+    is_causal: bool = False,
+    return_lse: bool = False,
+    _parallel_config: Optional["ParallelConfig"] = None,
+) -> torch.Tensor:
+    batch_size, seq_len_q, _, _ = query.shape
+    _, seq_len_kv, _, _ = key.shape
+
+    if attn_mask is not None:
+        attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv)
+
+    (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = (
+        _prepare_for_flash_attn_or_sage_varlen(
+            batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device
+        )
+    )
+
+    key_valid, value_valid = [], []
+    for b in range(batch_size):
+        valid_len = seqlens_k[b]
+        key_valid.append(key[b, :valid_len])
+        value_valid.append(value[b, :valid_len])
+
+    query_packed = query.flatten(0, 1)
+    key_packed = torch.cat(key_valid, dim=0)
+    value_packed = torch.cat(value_valid, dim=0)
+
+    func = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_VARLEN_HUB].kernel_fn
+    out, lse, *_ = func(
+        q=query_packed,
+        k=key_packed,
+        v=value_packed,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        max_seqlen_q=max_seqlen_q,
+        max_seqlen_k=max_seqlen_k,
+        softmax_scale=scale,
+        causal=is_causal,
+    )
+    out = out.unflatten(0, (batch_size, -1))
+
+    return (out, lse) if return_lse else out
+
+
 @_AttentionBackendRegistry.register(
     AttentionBackendName._FLASH_VARLEN_3,
     constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
@@ -1523,6 +1793,7 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
 @_AttentionBackendRegistry.register(
     AttentionBackendName.NATIVE,
     constraints=[_check_device, _check_shape],
+    supports_context_parallel=True,
 )
 def _native_attention(
     query: torch.Tensor,
@@ -1538,18 +1809,35 @@ def _native_attention(
 ) -> torch.Tensor:
     if return_lse:
         raise ValueError("Native attention backend does not support setting `return_lse=True`.")
-    query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
-    out = torch.nn.functional.scaled_dot_product_attention(
-        query=query,
-        key=key,
-        value=value,
-        attn_mask=attn_mask,
-        dropout_p=dropout_p,
-        is_causal=is_causal,
-        scale=scale,
-        enable_gqa=enable_gqa,
-    )
-    out = out.permute(0, 2, 1, 3)
+    if _parallel_config is None:
+        query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
+        out = torch.nn.functional.scaled_dot_product_attention(
+            query=query,
+            key=key,
+            value=value,
+            attn_mask=attn_mask,
+            dropout_p=dropout_p,
+            is_causal=is_causal,
+            scale=scale,
+            enable_gqa=enable_gqa,
+        )
+        out = out.permute(0, 2, 1, 3)
+    else:
+        out = _templated_context_parallel_attention(
+            query,
+            key,
+            value,
+            attn_mask,
+            dropout_p,
+            is_causal,
+            scale,
+            enable_gqa,
+            return_lse,
+            forward_op=_native_attention_forward_op,
+            backward_op=_native_attention_backward_op,
+            _parallel_config=_parallel_config,
+        )
+
     return out
 
 
@@ -1816,6 +2104,38 @@ def _sage_attention(
     return (out, lse) if return_lse else out
 
 
+@_AttentionBackendRegistry.register(
+    AttentionBackendName.SAGE_HUB,
+    constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
+    supports_context_parallel=False,
+)
+def _sage_attention_hub(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+    return_lse: bool = False,
+    _parallel_config: Optional["ParallelConfig"] = None,
+) -> torch.Tensor:
+    lse = None
+    func = _HUB_KERNELS_REGISTRY[AttentionBackendName.SAGE_HUB].kernel_fn
+    if _parallel_config is None:
+        out = func(
+            q=query,
+            k=key,
+            v=value,
+            tensor_layout="NHD",
+            is_causal=is_causal,
+            sm_scale=scale,
+            return_lse=return_lse,
+        )
+        if return_lse:
+            out, lse, *_ = out
+
+    return (out, lse) if return_lse else out
+
+
 @_AttentionBackendRegistry.register(
     AttentionBackendName.SAGE_VARLEN,
     constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
diff --git a/src/diffusers/models/auto_model.py b/src/diffusers/models/auto_model.py
index 947b610ea61a..c96b4fa88c49 100644
--- a/src/diffusers/models/auto_model.py
+++ b/src/diffusers/models/auto_model.py
@@ -147,14 +147,13 @@ def from_pretrained(cls, pretrained_model_or_path: Optional[Union[str, os.PathLi
             "force_download",
             "local_files_only",
             "proxies",
-            "resume_download",
             "revision",
             "token",
         ]
         hub_kwargs = {name: kwargs.pop(name, None) for name in hub_kwargs_names}
 
         # load_config_kwargs uses the same hub kwargs minus subfolder and resume_download
-        load_config_kwargs = {k: v for k, v in hub_kwargs.items() if k not in ["subfolder", "resume_download"]}
+        load_config_kwargs = {k: v for k, v in hub_kwargs.items() if k not in ["subfolder"]}
 
         library = None
         orig_class_name = None
@@ -205,7 +204,6 @@ def from_pretrained(cls, pretrained_model_or_path: Optional[Union[str, os.PathLi
                 module_file=module_file,
                 class_name=class_name,
                 **hub_kwargs,
-                **kwargs,
             )
         else:
             from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates
diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py
index edfaabb070c5..56df27f93cd7 100644
--- a/src/diffusers/models/autoencoders/__init__.py
+++ b/src/diffusers/models/autoencoders/__init__.py
@@ -4,9 +4,11 @@
 from .autoencoder_kl_allegro import AutoencoderKLAllegro
 from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX
 from .autoencoder_kl_cosmos import AutoencoderKLCosmos
+from .autoencoder_kl_flux2 import AutoencoderKLFlux2
 from .autoencoder_kl_hunyuan_video import AutoencoderKLHunyuanVideo
 from .autoencoder_kl_hunyuanimage import AutoencoderKLHunyuanImage
 from .autoencoder_kl_hunyuanimage_refiner import AutoencoderKLHunyuanImageRefiner
+from .autoencoder_kl_hunyuanvideo15 import AutoencoderKLHunyuanVideo15
 from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
 from .autoencoder_kl_magvit import AutoencoderKLMagvit
 from .autoencoder_kl_mochi import AutoencoderKLMochi
diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py
index 724ec3bb760c..ec301ef8ad51 100644
--- a/src/diffusers/models/autoencoders/autoencoder_dc.py
+++ b/src/diffusers/models/autoencoders/autoencoder_dc.py
@@ -102,7 +102,7 @@ def get_block(
     attention_head_dim: int,
     norm_type: str,
     act_fn: str,
-    qkv_mutliscales: Tuple[int] = (),
+    qkv_mutliscales: Tuple[int, ...] = (),
 ):
     if block_type == "ResBlock":
         block = ResBlock(in_channels, out_channels, norm_type, act_fn)
@@ -206,8 +206,8 @@ def __init__(
         latent_channels: int,
         attention_head_dim: int = 32,
         block_type: Union[str, Tuple[str]] = "ResBlock",
-        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
+        layers_per_block: Tuple[int, ...] = (2, 2, 2, 2, 2, 2),
         qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         downsample_block_type: str = "pixel_unshuffle",
         out_shortcut: bool = True,
@@ -292,8 +292,8 @@ def __init__(
         latent_channels: int,
         attention_head_dim: int = 32,
         block_type: Union[str, Tuple[str]] = "ResBlock",
-        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
+        layers_per_block: Tuple[int, ...] = (2, 2, 2, 2, 2, 2),
         qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         norm_type: Union[str, Tuple[str]] = "rms_norm",
         act_fn: Union[str, Tuple[str]] = "silu",
@@ -440,8 +440,8 @@ def __init__(
         decoder_block_types: Union[str, Tuple[str]] = "ResBlock",
         encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
         decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
-        encoder_layers_per_block: Tuple[int] = (2, 2, 2, 3, 3, 3),
-        decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3, 3, 3),
+        encoder_layers_per_block: Tuple[int, ...] = (2, 2, 2, 3, 3, 3),
+        decoder_layers_per_block: Tuple[int, ...] = (3, 3, 3, 3, 3, 3),
         encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         upsample_block_type: str = "pixel_shuffle",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 1a72aa3cfeb3..10ecda39c058 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -11,21 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
+import torch.distributed as dist
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin
 from ...loaders.single_file_model import FromOriginalModelMixin
-from ...utils import deprecate
+from ...utils import deprecate, logging
 from ...utils.accelerate_utils import apply_forward_hook
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     Attention,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
     FusedAttnProcessor2_0,
@@ -35,7 +36,12 @@
 from .vae import AutoencoderMixin, Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
-class AutoencoderKL(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class AutoencoderKL(
+    ModelMixin, AttentionMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin
+):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
 
@@ -72,15 +78,16 @@ class AutoencoderKL(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModel
 
     _supports_gradient_checkpointing = True
     _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
+    _group_offload_block_modules = ["quant_conv", "post_quant_conv", "encoder", "decoder"]
 
     @register_to_config
     def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         act_fn: str = "silu",
         latent_channels: int = 4,
@@ -127,6 +134,7 @@ def __init__(
 
         self.use_slicing = False
         self.use_tiling = False
+        self.use_dp = False
 
         # only relevant if vae tiling is enabled
         self.tile_sample_min_size = self.config.sample_size
@@ -138,66 +146,6 @@ def __init__(
         self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
         self.tile_overlap_factor = 0.25
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
@@ -214,9 +162,58 @@ def set_default_attn_processor(self):
 
         self.set_attn_processor(processor)
 
+    def enable_dp(
+        self,
+        world_size: Optional[int] = None,
+        hw_splits: Optional[Tuple[int, int]] = None,
+        overlap_ratio: Optional[float] = None,
+        overlap_pixels: Optional[int] = None
+    ) -> None:
+        r"""
+        """
+        if world_size is None:
+            world_size = dist.get_world_size() if dist.is_initialized() else 1
+
+        if world_size <= 1 or world_size > dist.get_world_size():
+            logger.warning(
+                f"Supported world_size for vae dp is between 2 - {dist.get_world_size}, but got {world_size}. " \
+                f"Fall back to normal vae")
+            return
+
+        if hw_splits is None:
+            hw_splits = (1, int(world_size))
+
+        assert len(hw_splits) == 2, f"'hw_splits' should be a tuple of 2 int, but got length {len(hw_splits)}"
+
+        h_split, w_split = map(int, hw_splits)
+
+        self.use_dp = True
+        self.h_split, self.w_split = h_split, w_split
+        self.world_size = world_size
+        self.overlap_ratio = overlap_ratio
+        self.overlap_pixels = overlap_pixels
+        self.spatial_compression_ratio = 2 ** (len(self.config.block_out_channels) - 1)
+
+        dp_ranks = list(range(0, world_size))
+        self.vae_dp_group = dist.new_group(ranks=dp_ranks)
+        self.rank = dist.get_rank()
+        # patch_ranks_flatten = [tile_idx % world_size for tile_idx in range(num_tiles)]
+        # self.patch_ranks = torch.Tensor(patch_ranks_flatten).reshape(h_split, w_split)
+        self.tile_idxs_per_rank = [[] for _ in range(self.world_size)]
+        self.num_tiles_per_rank = [0] * self.world_size
+        rank_idx = 0
+        for h_idx in range(self.h_split):
+            for w_idx in range(self.w_split):
+                rank_idx %= self.world_size
+                self.tile_idxs_per_rank[rank_idx].append((h_idx, w_idx))
+                self.num_tiles_per_rank[rank_idx] += 1
+                rank_idx += 1
+
     def _encode(self, x: torch.Tensor) -> torch.Tensor:
         batch_size, num_channels, height, width = x.shape
 
+        if self.use_dp:
+            return self._tiled_encode(x)
         if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size):
             return self._tiled_encode(x)
 
@@ -256,6 +253,8 @@ def encode(
         return AutoencoderKLOutput(latent_dist=posterior)
 
     def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        if self.use_dp:
+            return self.tiled_decode_with_dp(z, return_dict=return_dict)
         if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
             return self.tiled_decode(z, return_dict=return_dict)
 
@@ -310,6 +309,20 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.
             b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
         return b
 
+    def blend_v_(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        y = torch.arange(0, blend_extent, device=a.device)
+        blend_ratio = (y / blend_extent)[None, None, :, None].to(a.dtype)
+        b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - blend_ratio) + b[:, :, y, :] * blend_ratio
+        return b
+
+    def blend_h_(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        x = torch.arange(0, blend_extent, device=a.device)
+        blend_ratio = (x / blend_extent)[None, None, None, :].to(a.dtype)
+        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - blend_ratio) + b[:, :, :, x] * blend_ratio
+        return b
+
     def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
         r"""Encode a batch of images using a tiled encoder.
 
@@ -469,6 +482,157 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[Decod
 
         return DecoderOutput(sample=dec)
 
+    def calculate_tiled_parallel_size(self, latent_height, latent_width):
+        # Calculate stride based on h_split and w_split
+        tile_latent_stride_height = int((latent_height + self.h_split - 1) / self.h_split)
+        tile_latent_stride_width = int((latent_width + self.w_split - 1) / self.w_split)
+
+        # Calculate overlap in latent space
+        overlap_latent_height = 3
+        overlap_latent_width = 3
+        if self.overlap_pixels is not None:
+            overlap_latent = (self.overlap_pixels + self.spatial_compression_ratio - 1) // self.spatial_compression_ratio
+            overlap_latent_height = overlap_latent
+            overlap_latent_width = overlap_latent
+        elif self.overlap_ratio is not None:
+            overlap_latent_height = int(self.overlap_ratio * latent_height)
+            overlap_latent_width = int(self.overlap_ratio * latent_width)
+
+        # Calculate minimum tile size in latent space
+        tile_latent_min_height = tile_latent_stride_height + overlap_latent_height
+        tile_latent_min_width = tile_latent_stride_width + overlap_latent_width
+
+        tile_sample_min_height = tile_latent_min_height * self.spatial_compression_ratio
+        tile_sample_min_width = tile_latent_min_width * self.spatial_compression_ratio
+        tile_sample_stride_height = tile_latent_stride_height * self.spatial_compression_ratio
+        tile_sample_stride_width = tile_latent_stride_width * self.spatial_compression_ratio
+
+        blend_latent_height = tile_latent_min_height - tile_latent_stride_height
+        blend_latent_width = tile_latent_min_width - tile_latent_stride_width
+
+        blend_sample_height = tile_sample_min_height - tile_sample_stride_height
+        blend_sample_width = tile_sample_min_width - tile_sample_stride_width
+
+        return \
+            tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, \
+            tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, \
+            blend_latent_height, blend_latent_width, blend_sample_height, blend_sample_width
+
+    def _tiled_encode_with_dp(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+
+        _, _, height, width = x.shape
+        device = x.device
+        latent_height = height // self.spatial_compression_ratio
+        latent_width = width // self.spatial_compression_ratio
+
+        tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, \
+        tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, \
+        blend_latent_height, blend_latent_width, blend_sample_height, blend_sample_width = \
+            self.calculate_tiled_parallel_size(latent_height, latent_width)
+
+        def vae_encode_op(
+                x, patch_height_start, patch_height_end, patch_width_start, patch_width_end
+            ) -> torch.Tensor:
+            tile = x[:, :, patch_height_start : patch_height_end, patch_width_start : patch_width_end]
+            tile = self.encoder(tile)
+            if self.config.use_quant_conv:
+                tile = self.quant_conv(tile)
+            return tile
+
+        rows = self.run_vae_tile_parallel(
+            x, vae_encode_op, 
+            tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, device
+        )
+ 
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v_(rows[i - 1][j], tile, blend_latent_height)
+                if j > 0:
+                    tile = self.blend_h_(row[j - 1], tile, blend_latent_width)
+                result_row.append(tile[:, :, :tile_latent_stride_height, :tile_latent_stride_width])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        enc = torch.cat(result_rows, dim=2)[:, :, :latent_height, :latent_width]
+        return enc
+
+    def tiled_decode_with_dp(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        _, _, latent_height, latent_width = z.shape
+        device = z.device
+        sample_height = latent_height * self.spatial_compression_ratio
+        sample_width = latent_width * self.spatial_compression_ratio
+
+        tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, \
+        tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, \
+        blend_latent_height, blend_latent_width, blend_sample_height, blend_sample_width = \
+            self.calculate_tiled_parallel_size(latent_height, latent_width)
+
+        def vae_decode_op(
+                z, patch_height_start, patch_height_end, patch_width_start, patch_width_end
+            ) -> torch.Tensor:
+
+            tile = z[:, :, patch_height_start : patch_height_end, patch_width_start : patch_width_end]
+            if self.config.use_post_quant_conv:
+                tile = self.post_quant_conv(tile)
+            decoded = self.decoder(tile)
+            return decoded
+
+        rows = self.run_vae_tile_parallel(
+            z, vae_decode_op, 
+            tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, device
+        )
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v_(rows[i - 1][j], tile, blend_sample_height)
+                if j > 0:
+                    tile = self.blend_h_(row[j - 1], tile, blend_sample_width)
+                result_row.append(tile[:, :, :tile_sample_stride_height, :tile_sample_stride_width])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        dec = torch.cat(result_rows, dim=2)[:, :, :sample_height, :sample_width]
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
     def forward(
         self,
         sample: torch.Tensor,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 5096b725d0bb..79433f7b9232 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -995,19 +995,19 @@ def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
         ),
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
         ),
-        block_out_channels: Tuple[int] = (128, 256, 256, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
         latent_channels: int = 16,
         layers_per_block: int = 3,
         act_fn: str = "silu",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py b/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py
new file mode 100644
index 000000000000..3325d33c06bf
--- /dev/null
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py
@@ -0,0 +1,488 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin
+from ...loaders.single_file_model import FromOriginalModelMixin
+from ...utils import deprecate
+from ...utils.accelerate_utils import apply_forward_hook
+from ..attention import AttentionMixin
+from ..attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    FusedAttnProcessor2_0,
+)
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from .vae import AutoencoderMixin, Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
+
+
+class AutoencoderKLFlux2(
+    ModelMixin, AutoencoderMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin
+):
+    r"""
+    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
+        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+        force_upcast (`bool`, *optional*, default to `True`):
+            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without losing too much precision in which case `force_upcast`
+            can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
+        mid_block_add_attention (`bool`, *optional*, default to `True`):
+            If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the
+            mid_block will only have resnet blocks
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = (
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+            "DownEncoderBlock2D",
+        ),
+        up_block_types: Tuple[str, ...] = (
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+            "UpDecoderBlock2D",
+        ),
+        block_out_channels: Tuple[int, ...] = (
+            128,
+            256,
+            512,
+            512,
+        ),
+        layers_per_block: int = 2,
+        act_fn: str = "silu",
+        latent_channels: int = 32,
+        norm_num_groups: int = 32,
+        sample_size: int = 1024,  # YiYi notes: not sure
+        force_upcast: bool = True,
+        use_quant_conv: bool = True,
+        use_post_quant_conv: bool = True,
+        mid_block_add_attention: bool = True,
+        batch_norm_eps: float = 1e-4,
+        batch_norm_momentum: float = 0.1,
+        patch_size: Tuple[int, int] = (2, 2),
+    ):
+        super().__init__()
+
+        # pass init params to Encoder
+        self.encoder = Encoder(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=True,
+            mid_block_add_attention=mid_block_add_attention,
+        )
+
+        # pass init params to Decoder
+        self.decoder = Decoder(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            norm_num_groups=norm_num_groups,
+            act_fn=act_fn,
+            mid_block_add_attention=mid_block_add_attention,
+        )
+
+        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None
+        self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) if use_post_quant_conv else None
+
+        self.bn = nn.BatchNorm2d(
+            math.prod(patch_size) * latent_channels,
+            eps=batch_norm_eps,
+            momentum=batch_norm_momentum,
+            affine=False,
+            track_running_stats=True,
+        )
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_size = self.config.sample_size
+        sample_size = (
+            self.config.sample_size[0]
+            if isinstance(self.config.sample_size, (list, tuple))
+            else self.config.sample_size
+        )
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
+        self.tile_overlap_factor = 0.25
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
+
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, height, width = x.shape
+
+        if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size):
+            return self._tiled_encode(x)
+
+        enc = self.encoder(x)
+        if self.quant_conv is not None:
+            enc = self.quant_conv(enc)
+
+        return enc
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.Tensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self._encode(x)
+
+        posterior = DiagonalGaussianDistribution(h)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
+            return self.tiled_decode(z, return_dict=return_dict)
+
+        if self.post_quant_conv is not None:
+            z = self.post_quant_conv(z)
+
+        dec = self.decoder(z)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
+        return b
+
+    def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_latent_min_size - blend_extent
+
+        # Split the image into 512x512 tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[2], overlap_size):
+            row = []
+            for j in range(0, x.shape[3], overlap_size):
+                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
+                tile = self.encoder(tile)
+                if self.config.use_quant_conv:
+                    tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        enc = torch.cat(result_rows, dim=2)
+        return enc
+
+    def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
+                `tuple` is returned.
+        """
+        deprecation_message = (
+            "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the "
+            "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able "
+            "to pass `return_dict`. You will also have to create a `DiagonalGaussianDistribution()` from the returned value."
+        )
+        deprecate("tiled_encode", "1.0.0", deprecation_message, standard_warn=False)
+
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_latent_min_size - blend_extent
+
+        # Split the image into 512x512 tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[2], overlap_size):
+            row = []
+            for j in range(0, x.shape[3], overlap_size):
+                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
+                tile = self.encoder(tile)
+                if self.config.use_quant_conv:
+                    tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        moments = torch.cat(result_rows, dim=2)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_sample_min_size - blend_extent
+
+        # Split z into overlapping 64x64 tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[2], overlap_size):
+            row = []
+            for j in range(0, z.shape[3], overlap_size):
+                tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+                if self.config.use_post_quant_conv:
+                    tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        dec = torch.cat(result_rows, dim=2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Args:
+            sample (`torch.Tensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        > [!WARNING] > This API is 🧪 experimental.
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+        self.set_attn_processor(FusedAttnProcessor2_0())
+
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        > [!WARNING] > This API is 🧪 experimental.
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
index 88b9bb507ff6..ddc0aed6b0ff 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
@@ -653,7 +653,7 @@ def __init__(
             "HunyuanVideoUpBlock3D",
             "HunyuanVideoUpBlock3D",
         ),
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: int = 2,
         act_fn: str = "silu",
         norm_num_groups: int = 32,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
index af40c7a6cbd0..2249063a9f00 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
@@ -601,7 +601,7 @@ def __init__(
         in_channels: int = 3,
         out_channels: int = 3,
         latent_channels: int = 32,
-        block_out_channels: Tuple[int] = (128, 256, 512, 1024, 1024),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024, 1024),
         layers_per_block: int = 2,
         spatial_compression_ratio: int = 16,
         temporal_compression_ratio: int = 4,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py
new file mode 100644
index 000000000000..4b1beb74a3bc
--- /dev/null
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py
@@ -0,0 +1,967 @@
+# Copyright 2025 The Hunyuan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import logging
+from ...utils.accelerate_utils import apply_forward_hook
+from ..activations import get_activation
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from .vae import DecoderOutput, DiagonalGaussianDistribution
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class HunyuanVideo15CausalConv3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int, int]] = 3,
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        padding: Union[int, Tuple[int, int, int]] = 0,
+        dilation: Union[int, Tuple[int, int, int]] = 1,
+        bias: bool = True,
+        pad_mode: str = "replicate",
+    ) -> None:
+        super().__init__()
+
+        kernel_size = (kernel_size, kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+
+        self.pad_mode = pad_mode
+        self.time_causal_padding = (
+            kernel_size[0] // 2,
+            kernel_size[0] // 2,
+            kernel_size[1] // 2,
+            kernel_size[1] // 2,
+            kernel_size[2] - 1,
+            0,
+        )
+
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = F.pad(hidden_states, self.time_causal_padding, mode=self.pad_mode)
+        return self.conv(hidden_states)
+
+
+class HunyuanVideo15RMS_norm(nn.Module):
+    r"""
+    A custom RMS normalization layer.
+
+    Args:
+        dim (int): The number of dimensions to normalize over.
+        channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
+            Default is True.
+        images (bool, optional): Whether the input represents image data. Default is True.
+        bias (bool, optional): Whether to include a learnable bias term. Default is False.
+    """
+
+    def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bias: bool = False) -> None:
+        super().__init__()
+        broadcastable_dims = (1, 1, 1) if not images else (1, 1)
+        shape = (dim, *broadcastable_dims) if channel_first else (dim,)
+
+        self.channel_first = channel_first
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(shape))
+        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
+
+    def forward(self, x):
+        return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
+
+
+class HunyuanVideo15AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = HunyuanVideo15RMS_norm(in_channels, images=False)
+
+        self.to_q = nn.Conv3d(in_channels, in_channels, kernel_size=1)
+        self.to_k = nn.Conv3d(in_channels, in_channels, kernel_size=1)
+        self.to_v = nn.Conv3d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv3d(in_channels, in_channels, kernel_size=1)
+
+    @staticmethod
+    def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: int = None):
+        """Prepare a causal attention mask for 3D videos.
+
+        Args:
+            n_frame (int): Number of frames (temporal length).
+            n_hw (int): Product of height and width.
+            dtype: Desired mask dtype.
+            device: Device for the mask.
+            batch_size (int, optional): If set, expands for batch.
+
+        Returns:
+            torch.Tensor: Causal attention mask.
+        """
+        seq_len = n_frame * n_hw
+        mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
+        for i in range(seq_len):
+            i_frame = i // n_hw
+            mask[i, : (i_frame + 1) * n_hw] = 0
+        if batch_size is not None:
+            mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
+        return mask
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        identity = x
+
+        x = self.norm(x)
+
+        query = self.to_q(x)
+        key = self.to_k(x)
+        value = self.to_v(x)
+
+        batch_size, channels, frames, height, width = query.shape
+
+        query = query.reshape(batch_size, channels, frames * height * width).permute(0, 2, 1).unsqueeze(1).contiguous()
+        key = key.reshape(batch_size, channels, frames * height * width).permute(0, 2, 1).unsqueeze(1).contiguous()
+        value = value.reshape(batch_size, channels, frames * height * width).permute(0, 2, 1).unsqueeze(1).contiguous()
+
+        attention_mask = self.prepare_causal_attention_mask(
+            frames, height * width, query.dtype, query.device, batch_size=batch_size
+        )
+
+        x = nn.functional.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask)
+
+        # batch_size, 1, frames * height * width, channels
+
+        x = x.squeeze(1).reshape(batch_size, frames, height, width, channels).permute(0, 4, 1, 2, 3)
+        x = self.proj_out(x)
+
+        return x + identity
+
+
+class HunyuanVideo15Upsample(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, add_temporal_upsample: bool = True):
+        super().__init__()
+        factor = 2 * 2 * 2 if add_temporal_upsample else 1 * 2 * 2
+        self.conv = HunyuanVideo15CausalConv3d(in_channels, out_channels * factor, kernel_size=3)
+
+        self.add_temporal_upsample = add_temporal_upsample
+        self.repeats = factor * out_channels // in_channels
+
+    @staticmethod
+    def _dcae_upsample_rearrange(tensor, r1=1, r2=2, r3=2):
+        """
+        Convert (b, r1*r2*r3*c, f, h, w) -> (b, c, r1*f, r2*h, r3*w)
+
+        Args:
+            tensor: Input tensor of shape (b, r1*r2*r3*c, f, h, w)
+            r1: temporal upsampling factor
+            r2: height upsampling factor
+            r3: width upsampling factor
+        """
+        b, packed_c, f, h, w = tensor.shape
+        factor = r1 * r2 * r3
+        c = packed_c // factor
+
+        tensor = tensor.view(b, r1, r2, r3, c, f, h, w)
+        tensor = tensor.permute(0, 4, 5, 1, 6, 2, 7, 3)
+        return tensor.reshape(b, c, f * r1, h * r2, w * r3)
+
+    def forward(self, x: torch.Tensor):
+        r1 = 2 if self.add_temporal_upsample else 1
+        h = self.conv(x)
+        if self.add_temporal_upsample:
+            h_first = h[:, :, :1, :, :]
+            h_first = self._dcae_upsample_rearrange(h_first, r1=1, r2=2, r3=2)
+            h_first = h_first[:, : h_first.shape[1] // 2]
+            h_next = h[:, :, 1:, :, :]
+            h_next = self._dcae_upsample_rearrange(h_next, r1=r1, r2=2, r3=2)
+            h = torch.cat([h_first, h_next], dim=2)
+
+            # shortcut computation
+            x_first = x[:, :, :1, :, :]
+            x_first = self._dcae_upsample_rearrange(x_first, r1=1, r2=2, r3=2)
+            x_first = x_first.repeat_interleave(repeats=self.repeats // 2, dim=1)
+
+            x_next = x[:, :, 1:, :, :]
+            x_next = self._dcae_upsample_rearrange(x_next, r1=r1, r2=2, r3=2)
+            x_next = x_next.repeat_interleave(repeats=self.repeats, dim=1)
+            shortcut = torch.cat([x_first, x_next], dim=2)
+
+        else:
+            h = self._dcae_upsample_rearrange(h, r1=r1, r2=2, r3=2)
+            shortcut = x.repeat_interleave(repeats=self.repeats, dim=1)
+            shortcut = self._dcae_upsample_rearrange(shortcut, r1=r1, r2=2, r3=2)
+        return h + shortcut
+
+
+class HunyuanVideo15Downsample(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
+        super().__init__()
+        factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2
+        self.conv = HunyuanVideo15CausalConv3d(in_channels, out_channels // factor, kernel_size=3)
+
+        self.add_temporal_downsample = add_temporal_downsample
+        self.group_size = factor * in_channels // out_channels
+
+    @staticmethod
+    def _dcae_downsample_rearrange(tensor, r1=1, r2=2, r3=2):
+        """
+        Convert (b, c, r1*f, r2*h, r3*w) -> (b, r1*r2*r3*c, f, h, w)
+
+        This packs spatial/temporal dimensions into channels (opposite of upsample)
+        """
+        b, c, packed_f, packed_h, packed_w = tensor.shape
+        f, h, w = packed_f // r1, packed_h // r2, packed_w // r3
+
+        tensor = tensor.view(b, c, f, r1, h, r2, w, r3)
+        tensor = tensor.permute(0, 3, 5, 7, 1, 2, 4, 6)
+        return tensor.reshape(b, r1 * r2 * r3 * c, f, h, w)
+
+    def forward(self, x: torch.Tensor):
+        r1 = 2 if self.add_temporal_downsample else 1
+        h = self.conv(x)
+        if self.add_temporal_downsample:
+            h_first = h[:, :, :1, :, :]
+            h_first = self._dcae_downsample_rearrange(h_first, r1=1, r2=2, r3=2)
+            h_first = torch.cat([h_first, h_first], dim=1)
+            h_next = h[:, :, 1:, :, :]
+            h_next = self._dcae_downsample_rearrange(h_next, r1=r1, r2=2, r3=2)
+            h = torch.cat([h_first, h_next], dim=2)
+
+            # shortcut computation
+            x_first = x[:, :, :1, :, :]
+            x_first = self._dcae_downsample_rearrange(x_first, r1=1, r2=2, r3=2)
+            B, C, T, H, W = x_first.shape
+            x_first = x_first.view(B, h.shape[1], self.group_size // 2, T, H, W).mean(dim=2)
+            x_next = x[:, :, 1:, :, :]
+            x_next = self._dcae_downsample_rearrange(x_next, r1=r1, r2=2, r3=2)
+            B, C, T, H, W = x_next.shape
+            x_next = x_next.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2)
+            shortcut = torch.cat([x_first, x_next], dim=2)
+        else:
+            h = self._dcae_downsample_rearrange(h, r1=r1, r2=2, r3=2)
+            shortcut = self._dcae_downsample_rearrange(x, r1=r1, r2=2, r3=2)
+            B, C, T, H, W = shortcut.shape
+            shortcut = shortcut.view(B, h.shape[1], self.group_size, T, H, W).mean(dim=2)
+
+        return h + shortcut
+
+
+class HunyuanVideo15ResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        non_linearity: str = "swish",
+    ) -> None:
+        super().__init__()
+        out_channels = out_channels or in_channels
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        self.norm1 = HunyuanVideo15RMS_norm(in_channels, images=False)
+        self.conv1 = HunyuanVideo15CausalConv3d(in_channels, out_channels, kernel_size=3)
+
+        self.norm2 = HunyuanVideo15RMS_norm(out_channels, images=False)
+        self.conv2 = HunyuanVideo15CausalConv3d(out_channels, out_channels, kernel_size=3)
+
+        self.conv_shortcut = None
+        if in_channels != out_channels:
+            self.conv_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            residual = self.conv_shortcut(residual)
+
+        return hidden_states + residual
+
+
+class HunyuanVideo15MidBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        num_layers: int = 1,
+        add_attention: bool = True,
+    ) -> None:
+        super().__init__()
+        self.add_attention = add_attention
+
+        # There is always at least one resnet
+        resnets = [
+            HunyuanVideo15ResnetBlock(
+                in_channels=in_channels,
+                out_channels=in_channels,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(HunyuanVideo15AttnBlock(in_channels))
+            else:
+                attentions.append(None)
+
+            resnets.append(
+                HunyuanVideo15ResnetBlock(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.resnets[0](hidden_states)
+
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states
+
+
+class HunyuanVideo15DownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        downsample_out_channels: Optional[int] = None,
+        add_temporal_downsample: int = True,
+    ) -> None:
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                HunyuanVideo15ResnetBlock(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if downsample_out_channels is not None:
+            self.downsamplers = nn.ModuleList(
+                [
+                    HunyuanVideo15Downsample(
+                        out_channels,
+                        out_channels=downsample_out_channels,
+                        add_temporal_downsample=add_temporal_downsample,
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+        return hidden_states
+
+
+class HunyuanVideo15UpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        upsample_out_channels: Optional[int] = None,
+        add_temporal_upsample: bool = True,
+    ) -> None:
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            input_channels = in_channels if i == 0 else out_channels
+
+            resnets.append(
+                HunyuanVideo15ResnetBlock(
+                    in_channels=input_channels,
+                    out_channels=out_channels,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if upsample_out_channels is not None:
+            self.upsamplers = nn.ModuleList(
+                [
+                    HunyuanVideo15Upsample(
+                        out_channels,
+                        out_channels=upsample_out_channels,
+                        add_temporal_upsample=add_temporal_upsample,
+                    )
+                ]
+            )
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for resnet in self.resnets:
+                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
+
+        else:
+            for resnet in self.resnets:
+                hidden_states = resnet(hidden_states)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states)
+
+        return hidden_states
+
+
+class HunyuanVideo15Encoder3D(nn.Module):
+    r"""
+    3D vae encoder for HunyuanImageRefiner.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 64,
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024, 1024),
+        layers_per_block: int = 2,
+        temporal_compression_ratio: int = 4,
+        spatial_compression_ratio: int = 16,
+        downsample_match_channel: bool = True,
+    ) -> None:
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.group_size = block_out_channels[-1] // self.out_channels
+
+        self.conv_in = HunyuanVideo15CausalConv3d(in_channels, block_out_channels[0], kernel_size=3)
+        self.mid_block = None
+        self.down_blocks = nn.ModuleList([])
+
+        input_channel = block_out_channels[0]
+        for i in range(len(block_out_channels)):
+            add_spatial_downsample = i < np.log2(spatial_compression_ratio)
+            output_channel = block_out_channels[i]
+            if not add_spatial_downsample:
+                down_block = HunyuanVideo15DownBlock3D(
+                    num_layers=layers_per_block,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    downsample_out_channels=None,
+                    add_temporal_downsample=False,
+                )
+                input_channel = output_channel
+            else:
+                add_temporal_downsample = i >= np.log2(spatial_compression_ratio // temporal_compression_ratio)
+                downsample_out_channels = block_out_channels[i + 1] if downsample_match_channel else output_channel
+                down_block = HunyuanVideo15DownBlock3D(
+                    num_layers=layers_per_block,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    downsample_out_channels=downsample_out_channels,
+                    add_temporal_downsample=add_temporal_downsample,
+                )
+                input_channel = downsample_out_channels
+
+            self.down_blocks.append(down_block)
+
+        self.mid_block = HunyuanVideo15MidBlock(in_channels=block_out_channels[-1])
+
+        self.norm_out = HunyuanVideo15RMS_norm(block_out_channels[-1], images=False)
+        self.conv_act = nn.SiLU()
+        self.conv_out = HunyuanVideo15CausalConv3d(block_out_channels[-1], out_channels, kernel_size=3)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.conv_in(hidden_states)
+
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for down_block in self.down_blocks:
+                hidden_states = self._gradient_checkpointing_func(down_block, hidden_states)
+
+            hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
+        else:
+            for down_block in self.down_blocks:
+                hidden_states = down_block(hidden_states)
+
+            hidden_states = self.mid_block(hidden_states)
+
+        batch_size, _, frame, height, width = hidden_states.shape
+        short_cut = hidden_states.view(batch_size, -1, self.group_size, frame, height, width).mean(dim=2)
+
+        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+
+        hidden_states += short_cut
+
+        return hidden_states
+
+
+class HunyuanVideo15Decoder3D(nn.Module):
+    r"""
+    Causal decoder for 3D video-like data used for HunyuanImage-1.5 Refiner.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 32,
+        out_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (1024, 1024, 512, 256, 128),
+        layers_per_block: int = 2,
+        spatial_compression_ratio: int = 16,
+        temporal_compression_ratio: int = 4,
+        upsample_match_channel: bool = True,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.repeat = block_out_channels[0] // self.in_channels
+
+        self.conv_in = HunyuanVideo15CausalConv3d(self.in_channels, block_out_channels[0], kernel_size=3)
+        self.up_blocks = nn.ModuleList([])
+
+        # mid
+        self.mid_block = HunyuanVideo15MidBlock(in_channels=block_out_channels[0])
+
+        # up
+        input_channel = block_out_channels[0]
+        for i in range(len(block_out_channels)):
+            output_channel = block_out_channels[i]
+
+            add_spatial_upsample = i < np.log2(spatial_compression_ratio)
+            add_temporal_upsample = i < np.log2(temporal_compression_ratio)
+            if add_spatial_upsample or add_temporal_upsample:
+                upsample_out_channels = block_out_channels[i + 1] if upsample_match_channel else output_channel
+                up_block = HunyuanVideo15UpBlock3D(
+                    num_layers=self.layers_per_block + 1,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    upsample_out_channels=upsample_out_channels,
+                    add_temporal_upsample=add_temporal_upsample,
+                )
+                input_channel = upsample_out_channels
+            else:
+                up_block = HunyuanVideo15UpBlock3D(
+                    num_layers=self.layers_per_block + 1,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    upsample_out_channels=None,
+                    add_temporal_upsample=False,
+                )
+                input_channel = output_channel
+
+            self.up_blocks.append(up_block)
+
+        # out
+        self.norm_out = HunyuanVideo15RMS_norm(block_out_channels[-1], images=False)
+        self.conv_act = nn.SiLU()
+        self.conv_out = HunyuanVideo15CausalConv3d(block_out_channels[-1], out_channels, kernel_size=3)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.conv_in(hidden_states) + hidden_states.repeat_interleave(repeats=self.repeat, dim=1)
+
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
+
+            for up_block in self.up_blocks:
+                hidden_states = self._gradient_checkpointing_func(up_block, hidden_states)
+        else:
+            hidden_states = self.mid_block(hidden_states)
+
+            for up_block in self.up_blocks:
+                hidden_states = up_block(hidden_states)
+
+        # post-process
+        hidden_states = self.norm_out(hidden_states)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+        return hidden_states
+
+
+class AutoencoderKLHunyuanVideo15(ModelMixin, ConfigMixin):
+    r"""
+    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used for
+    HunyuanVideo-1.5.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        latent_channels: int = 32,
+        block_out_channels: Tuple[int] = (128, 256, 512, 1024, 1024),
+        layers_per_block: int = 2,
+        spatial_compression_ratio: int = 16,
+        temporal_compression_ratio: int = 4,
+        downsample_match_channel: bool = True,
+        upsample_match_channel: bool = True,
+        scaling_factor: float = 1.03682,
+    ) -> None:
+        super().__init__()
+
+        self.encoder = HunyuanVideo15Encoder3D(
+            in_channels=in_channels,
+            out_channels=latent_channels * 2,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            temporal_compression_ratio=temporal_compression_ratio,
+            spatial_compression_ratio=spatial_compression_ratio,
+            downsample_match_channel=downsample_match_channel,
+        )
+
+        self.decoder = HunyuanVideo15Decoder3D(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            block_out_channels=list(reversed(block_out_channels)),
+            layers_per_block=layers_per_block,
+            temporal_compression_ratio=temporal_compression_ratio,
+            spatial_compression_ratio=spatial_compression_ratio,
+            upsample_match_channel=upsample_match_channel,
+        )
+
+        self.spatial_compression_ratio = spatial_compression_ratio
+        self.temporal_compression_ratio = temporal_compression_ratio
+
+        # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
+        # to perform decoding of a single video latent at a time.
+        self.use_slicing = False
+
+        # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
+        # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
+        # intermediate tiles together, the memory requirement can be lowered.
+        self.use_tiling = False
+
+        # The minimal tile height and width for spatial tiling to be used
+        self.tile_sample_min_height = 256
+        self.tile_sample_min_width = 256
+
+        # The minimal tile height and width in latent space
+        self.tile_latent_min_height = self.tile_sample_min_height // spatial_compression_ratio
+        self.tile_latent_min_width = self.tile_sample_min_width // spatial_compression_ratio
+        self.tile_overlap_factor = 0.25
+
+    def enable_tiling(
+        self,
+        tile_sample_min_height: Optional[int] = None,
+        tile_sample_min_width: Optional[int] = None,
+        tile_latent_min_height: Optional[int] = None,
+        tile_latent_min_width: Optional[int] = None,
+        tile_overlap_factor: Optional[float] = None,
+    ) -> None:
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+
+        Args:
+            tile_sample_min_height (`int`, *optional*):
+                The minimum height required for a sample to be separated into tiles across the height dimension.
+            tile_sample_min_width (`int`, *optional*):
+                The minimum width required for a sample to be separated into tiles across the width dimension.
+            tile_latent_min_height (`int`, *optional*):
+                The minimum height required for a latent to be separated into tiles across the height dimension.
+            tile_latent_min_width (`int`, *optional*):
+                The minimum width required for a latent to be separated into tiles across the width dimension.
+        """
+        self.use_tiling = True
+        self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
+        self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        self.tile_latent_min_height = tile_latent_min_height or self.tile_latent_min_height
+        self.tile_latent_min_width = tile_latent_min_width or self.tile_latent_min_width
+        self.tile_overlap_factor = tile_overlap_factor or self.tile_overlap_factor
+
+    def disable_tiling(self) -> None:
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_tiling = False
+
+    def enable_slicing(self) -> None:
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self) -> None:
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        _, _, _, height, width = x.shape
+
+        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+            return self.tiled_encode(x)
+
+        x = self.encoder(x)
+        return x
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.Tensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        r"""
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded videos. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self._encode(x)
+
+        posterior = DiagonalGaussianDistribution(h)
+
+        if not return_dict:
+            return (posterior,)
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.Tensor) -> torch.Tensor:
+        _, _, _, height, width = z.shape
+
+        if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
+            return self.tiled_decode(z)
+
+        dec = self.decoder(z)
+
+        return dec
+
+    @apply_forward_hook
+    def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice) for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z)
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                y / blend_extent
+            )
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                x / blend_extent
+            )
+        return b
+
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                x / blend_extent
+            )
+        return b
+
+    def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        Args:
+            x (`torch.Tensor`): Input batch of videos.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+        _, _, _, height, width = x.shape
+
+        overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor))  # 256 * (1 - 0.25) = 192
+        overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor))  # 256 * (1 - 0.25) = 192
+        blend_height = int(self.tile_latent_min_height * self.tile_overlap_factor)  # 8 * 0.25 = 2
+        blend_width = int(self.tile_latent_min_width * self.tile_overlap_factor)  # 8 * 0.25 = 2
+        row_limit_height = self.tile_latent_min_height - blend_height  # 8 - 2 = 6
+        row_limit_width = self.tile_latent_min_width - blend_width  # 8 - 2 = 6
+
+        rows = []
+        for i in range(0, height, overlap_height):
+            row = []
+            for j in range(0, width, overlap_width):
+                tile = x[
+                    :,
+                    :,
+                    :,
+                    i : i + self.tile_sample_min_height,
+                    j : j + self.tile_sample_min_width,
+                ]
+                tile = self.encoder(tile)
+                row.append(tile)
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_width)
+                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+            result_rows.append(torch.cat(result_row, dim=-1))
+        moments = torch.cat(result_rows, dim=-2)
+
+        return moments
+
+    def tiled_decode(self, z: torch.Tensor) -> torch.Tensor:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+
+        _, _, _, height, width = z.shape
+
+        overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor))  # 8 * (1 - 0.25) = 6
+        overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor))  # 8 * (1 - 0.25) = 6
+        blend_height = int(self.tile_sample_min_height * self.tile_overlap_factor)  # 256 * 0.25 = 64
+        blend_width = int(self.tile_sample_min_width * self.tile_overlap_factor)  # 256 * 0.25 = 64
+        row_limit_height = self.tile_sample_min_height - blend_height  # 256 - 64 = 192
+        row_limit_width = self.tile_sample_min_width - blend_width  # 256 - 64 = 192
+
+        rows = []
+        for i in range(0, height, overlap_height):
+            row = []
+            for j in range(0, width, overlap_width):
+                tile = z[
+                    :,
+                    :,
+                    :,
+                    i : i + self.tile_latent_min_height,
+                    j : j + self.tile_latent_min_width,
+                ]
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_width)
+                result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
+            result_rows.append(torch.cat(result_row, dim=-1))
+        dec = torch.cat(result_rows, dim=-2)
+
+        return dec
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Args:
+            sample (`torch.Tensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z, return_dict=return_dict)
+        return dec
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
index 3ded9a0a5491..7a64ac7de172 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
@@ -688,8 +688,8 @@ def __init__(
         self,
         in_channels: int = 15,
         out_channels: int = 3,
-        encoder_block_out_channels: Tuple[int] = (64, 128, 256, 384),
-        decoder_block_out_channels: Tuple[int] = (128, 256, 512, 768),
+        encoder_block_out_channels: Tuple[int, ...] = (64, 128, 256, 384),
+        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 768),
         latent_channels: int = 12,
         layers_per_block: Tuple[int, ...] = (3, 3, 4, 6, 3),
         act_fn: str = "silu",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
index 14db6aeb61e3..618801dfb605 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
@@ -16,7 +16,7 @@
 # QwenImageVAE is further fine-tuned from the Wan Video VAE to achieve improved performance.
 # For more information about the Wan VAE, please refer to:
 # - GitHub: https://github.com/Wan-Video/Wan2.1
-# - arXiv: https://arxiv.org/abs/2503.20314
+# - Paper: https://huggingface.co/papers/2503.20314
 
 from typing import List, Optional, Tuple, Union
 
@@ -679,7 +679,7 @@ def __init__(
         self,
         base_dim: int = 96,
         z_dim: int = 16,
-        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        dim_mult: Tuple[int, ...] = (1, 2, 4, 4),
         num_res_blocks: int = 2,
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index ab76254d19e2..7a307b1eacd8 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -12,14 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import itertools
-from typing import Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils.accelerate_utils import apply_forward_hook
-from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
+from ..attention import AttentionMixin
+from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttnProcessor
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
 from ..unets.unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder
@@ -31,7 +32,7 @@ def __init__(
         self,
         in_channels: int = 4,
         out_channels: int = 3,
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: int = 2,
     ):
         super().__init__()
@@ -135,7 +136,7 @@ def forward(
         return sample
 
 
-class AutoencoderKLTemporalDecoder(ModelMixin, AutoencoderMixin, ConfigMixin):
+class AutoencoderKLTemporalDecoder(ModelMixin, AttentionMixin, AutoencoderMixin, ConfigMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
 
@@ -172,8 +173,8 @@ def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         latent_channels: int = 4,
         sample_size: int = 32,
@@ -202,66 +203,6 @@ def __init__(
 
         self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
index f8bdfeb75524..a7c22421481a 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
@@ -17,6 +17,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.distributed as dist
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin
@@ -619,6 +620,7 @@ def forward(self, x, feat_cache=None, feat_idx=[0]):
             feat_idx[0] += 1
         else:
             x = self.conv_out(x)
+
         return x
 
 
@@ -961,6 +963,7 @@ class AutoencoderKLWan(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalMo
     """
 
     _supports_gradient_checkpointing = False
+    _group_offload_block_modules = ["quant_conv", "post_quant_conv", "encoder", "decoder"]
     # keys toignore when AlignDeviceHook moves inputs/outputs between devices
     # these are shared mutable state modified in-place
     _skip_keys = ["feat_cache", "feat_idx"]
@@ -971,7 +974,7 @@ def __init__(
         base_dim: int = 96,
         decoder_base_dim: Optional[int] = None,
         z_dim: int = 16,
-        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        dim_mult: List[int] = [1, 2, 4, 4],
         num_res_blocks: int = 2,
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
@@ -1073,6 +1076,8 @@ def __init__(
         self.tile_sample_stride_height = 192
         self.tile_sample_stride_width = 192
 
+        self.use_dp = False
+
         # Precompute and cache conv counts for encoder and decoder for clear_cache speedup
         self._cached_conv_counts = {
             "decoder": sum(isinstance(m, WanCausalConv3d) for m in self.decoder.modules())
@@ -1113,6 +1118,52 @@ def enable_tiling(
         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
 
+    def enable_dp(
+        self,
+        world_size: Optional[int] = None,
+        hw_splits: Optional[Tuple[int, int]] = None,
+        overlap_ratio: Optional[float] = None,
+        overlap_pixels: Optional[int] = None
+    ) -> None:
+        r"""
+        """
+        if world_size is None:
+            world_size = dist.get_world_size() if dist.is_initialized() else 1
+
+        if world_size <= 1 or world_size > dist.get_world_size():
+            logger.warning(
+                f"Supported world_size for vae dp is between 2 - {dist.get_world_size}, but got {world_size}. " \
+                f"Fall back to normal vae")
+            return
+
+        if hw_splits is None:
+            hw_splits = (1, int(world_size))
+
+        assert len(hw_splits) == 2, f"'hw_splits' should be a tuple of 2 int, but got length {len(hw_splits)}"
+
+        h_split, w_split = map(int, hw_splits)
+
+        self.use_dp = True
+        self.h_split, self.w_split = h_split, w_split
+        self.world_size = world_size
+        self.overlap_ratio = overlap_ratio
+        self.overlap_pixels = overlap_pixels
+
+        dp_ranks = list(range(0, world_size))
+        self.vae_dp_group = dist.new_group(ranks=dp_ranks)
+        self.rank = dist.get_rank()
+        # patch_ranks_flatten = [tile_idx % world_size for tile_idx in range(num_tiles)]
+        # self.patch_ranks = torch.Tensor(patch_ranks_flatten).reshape(h_split, w_split)
+        self.tile_idxs_per_rank = [[] for _ in range(self.world_size)]
+        self.num_tiles_per_rank = [0] * self.world_size
+        rank_idx = 0
+        for h_idx in range(self.h_split):
+            for w_idx in range(self.w_split):
+                rank_idx %= self.world_size
+                self.tile_idxs_per_rank[rank_idx].append((h_idx, w_idx))
+                self.num_tiles_per_rank[rank_idx] += 1
+                rank_idx += 1
+
     def clear_cache(self):
         # Use cached conv counts for decoder and encoder to avoid re-iterating modules each call
         self._conv_num = self._cached_conv_counts["decoder"]
@@ -1130,6 +1181,9 @@ def _encode(self, x: torch.Tensor):
         if self.config.patch_size is not None:
             x = patchify(x, patch_size=self.config.patch_size)
 
+        if self.use_dp:
+            return self.tiled_encode_with_dp(x)
+
         if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
             return self.tiled_encode(x)
 
@@ -1182,6 +1236,9 @@ def _decode(self, z: torch.Tensor, return_dict: bool = True):
         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
 
+        if self.use_dp:
+            return self.tiled_decode_with_dp(z, return_dict=return_dict)
+
         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
             return self.tiled_decode(z, return_dict=return_dict)
 
@@ -1249,6 +1306,20 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.
             )
         return b
 
+    def blend_v_(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        y = torch.arange(0, blend_extent, device=a.device)
+        blend_ratio = (y / blend_extent)[None, None, None, :, None].to(a.dtype)
+        b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - blend_ratio) + b[:, :, :, y, :] * blend_ratio
+        return b
+
+    def blend_h_(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        x = torch.arange(0, blend_extent, device=a.device)
+        blend_ratio = (x / blend_extent)[None, None, None, None, :].to(a.dtype)
+        b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - blend_ratio) + b[:, :, :, :, x] * blend_ratio
+        return b
+
     def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
         r"""Encode a batch of images using a tiled encoder.
 
@@ -1259,14 +1330,20 @@ def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
             `torch.Tensor`:
                 The latent representation of the encoded videos.
         """
+
         _, _, num_frames, height, width = x.shape
-        latent_height = height // self.spatial_compression_ratio
-        latent_width = width // self.spatial_compression_ratio
+        encode_spatial_compression_ratio = self.spatial_compression_ratio
+        if self.config.patch_size is not None:
+            assert encode_spatial_compression_ratio % self.config.patch_size == 0
+            encode_spatial_compression_ratio = self.spatial_compression_ratio // self.config.patch_size
 
-        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
-        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
-        tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
-        tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+        latent_height = height // encode_spatial_compression_ratio
+        latent_width = width // encode_spatial_compression_ratio
+
+        tile_latent_min_height = self.tile_sample_min_height // encode_spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // encode_spatial_compression_ratio
+        tile_latent_stride_height = self.tile_sample_stride_height // encode_spatial_compression_ratio
+        tile_latent_stride_width = self.tile_sample_stride_width // encode_spatial_compression_ratio
 
         blend_height = tile_latent_min_height - tile_latent_stride_height
         blend_width = tile_latent_min_width - tile_latent_stride_width
@@ -1393,6 +1470,191 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[Decod
             return (dec,)
         return DecoderOutput(sample=dec)
 
+    def calculate_tiled_parallel_size(self, latent_height, latent_width):
+        # Calculate stride based on h_split and w_split
+        tile_latent_stride_height = int((latent_height + self.h_split - 1) / self.h_split)
+        tile_latent_stride_width = int((latent_width + self.w_split - 1) / self.w_split)
+
+        # Calculate overlap in latent space
+        overlap_latent_height = 3
+        overlap_latent_width = 3
+        if self.overlap_pixels is not None:
+            overlap_latent = (self.overlap_pixels + self.spatial_compression_ratio - 1) // self.spatial_compression_ratio
+            overlap_latent_height = overlap_latent
+            overlap_latent_width = overlap_latent
+        elif self.overlap_ratio is not None:
+            overlap_latent_height = int(self.overlap_ratio * latent_height)
+            overlap_latent_width = int(self.overlap_ratio * latent_width)
+
+        # Calculate minimum tile size in latent space
+        tile_latent_min_height = tile_latent_stride_height + overlap_latent_height
+        tile_latent_min_width = tile_latent_stride_width + overlap_latent_width
+
+        tile_sample_min_height = tile_latent_min_height * self.spatial_compression_ratio
+        tile_sample_min_width = tile_latent_min_width * self.spatial_compression_ratio
+        tile_sample_stride_height = tile_latent_stride_height * self.spatial_compression_ratio
+        tile_sample_stride_width = tile_latent_stride_width * self.spatial_compression_ratio
+
+        blend_latent_height = tile_latent_min_height - tile_latent_stride_height
+        blend_latent_width = tile_latent_min_width - tile_latent_stride_width
+
+        if self.config.patch_size is not None:
+            sample_height = sample_height // self.config.patch_size
+            sample_width = sample_width // self.config.patch_size
+            tile_sample_stride_height = tile_sample_stride_height // self.config.patch_size
+            tile_sample_stride_width = tile_sample_stride_width // self.config.patch_size
+            blend_sample_height = tile_sample_min_height // self.config.patch_size - tile_sample_stride_height
+            blend_sample_width = tile_sample_min_width // self.config.patch_size - tile_sample_stride_width
+        else:
+            blend_sample_height = tile_sample_min_height - tile_sample_stride_height
+            blend_sample_width = tile_sample_min_width - tile_sample_stride_width
+
+        return \
+            tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, \
+            tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, \
+            blend_latent_height, blend_latent_width, blend_sample_height, blend_sample_width
+
+    def tiled_encode_with_dp(self, x: torch.Tensor) -> AutoencoderKLOutput:
+        r"""Encode a batch of images using a tiled encoder.
+
+        Args:
+            x (`torch.Tensor`): Input batch of videos.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+        _, _, num_frames, sample_height, sample_width = x.shape
+        device = x.device
+        latent_height = sample_height // self.spatial_compression_ratio
+        latent_width = sample_width // self.spatial_compression_ratio
+
+        tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, \
+        tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, \
+        blend_latent_height, blend_latent_width, blend_sample_height, blend_sample_width = \
+            self.calculate_tiled_parallel_size(latent_height, latent_width)
+
+        def vae_encode_op(
+                x, patch_height_start, patch_height_end, patch_width_start, patch_width_end, num_frames
+            ) -> torch.Tensor:
+
+            self.clear_cache()
+            time = []
+            frame_range = 1 + (num_frames - 1) // 4
+            for k in range(frame_range):
+                self._enc_conv_idx = [0]
+                if k == 0:
+                    tile = x[:, :, :1, patch_height_start : patch_height_end, patch_width_start : patch_width_end]
+                else:
+                    tile = x[
+                        :,
+                        :,
+                        1 + 4 * (k - 1) : 1 + 4 * k,
+                        patch_height_start : patch_height_end,
+                        patch_width_start : patch_width_end,
+                    ]
+                tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
+                tile = self.quant_conv(tile)
+                time.append(tile)
+            time = torch.cat(time, dim=2)
+            self.clear_cache()
+            return time
+
+        rows = self.run_vae_tile_parallel(
+            x, vae_encode_op, 
+            tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, device,
+            num_frames=num_frames
+        )
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v_(rows[i - 1][j], tile, blend_latent_height)
+                if j > 0:
+                    tile = self.blend_h_(row[j - 1], tile, blend_latent_width)
+                result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
+        return enc
+
+    def tiled_decode_with_dp(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        _, _, num_frames, latent_height, latent_width = z.shape
+        device = z.device
+        sample_height = latent_height * self.spatial_compression_ratio
+        sample_width = latent_width * self.spatial_compression_ratio
+
+        tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, \
+        tile_sample_min_height, tile_sample_min_width, tile_sample_stride_height, tile_sample_stride_width, \
+        blend_latent_height, blend_latent_width, blend_sample_height, blend_sample_width = \
+            self.calculate_tiled_parallel_size(latent_height, latent_width)
+
+        def vae_decode_op(
+                z, patch_height_start, patch_height_end, patch_width_start, patch_width_end, num_frames
+            ) -> torch.Tensor:
+
+            self.clear_cache()
+
+            time = []
+            for k in range(num_frames):
+                self._conv_idx = [0]
+                tile = z[:, :, k : k + 1, patch_height_start : patch_height_end, patch_width_start : patch_width_end]
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(
+                    tile, feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=(k == 0)
+                )
+                time.append(decoded)
+            time = torch.cat(time, dim=2)
+            self.clear_cache()
+            return time
+
+        rows = self.run_vae_tile_parallel(
+            z, vae_decode_op, 
+            tile_latent_min_height, tile_latent_min_width, tile_latent_stride_height, tile_latent_stride_width, device,
+            num_frames=num_frames
+        )
+
+        # combine all tiles, same as tiled decode
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v_(rows[i - 1][j], tile, blend_sample_height)
+                if j > 0:
+                    tile = self.blend_h_(row[j - 1], tile, blend_sample_width)
+                result_row.append(tile[:, :, :, :tile_sample_stride_height, :tile_sample_stride_width])
+            result_rows.append(torch.cat(result_row, dim=-1))
+        dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
+
+        if self.config.patch_size is not None:
+            dec = unpatchify(dec, patch_size=self.config.patch_size)
+
+        dec = torch.clamp(dec, min=-1.0, max=1.0)
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
+
     def forward(
         self,
         sample: torch.Tensor,
@@ -1408,6 +1670,7 @@ def forward(
         """
         x = sample
         posterior = self.encode(x).latent_dist
+
         if sample_posterior:
             z = posterior.sample(generator=generator)
         else:
diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
index 0a6258fed37f..db9404f4ac70 100644
--- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py
+++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
@@ -23,10 +23,10 @@
 from ...utils import BaseOutput
 from ...utils.accelerate_utils import apply_forward_hook
 from ...utils.torch_utils import randn_tensor
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -49,7 +49,7 @@ class ConsistencyDecoderVAEOutput(BaseOutput):
     latent_dist: "DiagonalGaussianDistribution"
 
 
-class ConsistencyDecoderVAE(ModelMixin, AutoencoderMixin, ConfigMixin):
+class ConsistencyDecoderVAE(ModelMixin, AttentionMixin, AutoencoderMixin, ConfigMixin):
     r"""
     The consistency decoder used with DALL-E 3.
 
@@ -167,66 +167,6 @@ def __init__(
         self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
         self.tile_overlap_factor = 0.25
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/autoencoders/vae.py b/src/diffusers/models/autoencoders/vae.py
index 9c6031a988f9..d798711ec240 100644
--- a/src/diffusers/models/autoencoders/vae.py
+++ b/src/diffusers/models/autoencoders/vae.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional, Tuple, List
 
 import numpy as np
 import torch
 import torch.nn as nn
+import torch.distributed as dist
 
 from ...utils import BaseOutput
 from ...utils.torch_utils import randn_tensor
@@ -926,3 +927,78 @@ def disable_slicing(self):
         decoding in one step.
         """
         self.use_slicing = False
+
+    def enable_dp(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        if not hasattr(self, "use_tiling"):
+            raise NotImplementedError(f"Tiling Parallel doesn't seem to be implemented for {self.__class__.__name__}.")
+        self.use_dp = True
+
+    def disable_dp(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_dp = False
+
+    def run_vae_tile_parallel(
+            self, 
+            input: torch.Tensor,
+            vae_op, 
+            min_height, 
+            min_width, 
+            stride_height, 
+            stride_width, 
+            device,
+            **kwargs) -> List[List[torch.Tensor]]:
+
+        local_tiles = []
+        local_hw_shapes = []
+
+        for h_idx, w_idx in self.tile_idxs_per_rank[self.rank]:
+            patch_height_start = h_idx * stride_height
+            patch_height_end = patch_height_start + min_height
+            patch_width_start = w_idx * stride_width
+            patch_width_end = patch_width_start + min_width
+            tile = vae_op(input, patch_height_start, patch_height_end, patch_width_start, patch_width_end, **kwargs)
+            local_tiles.append(tile.flatten(-2, -1))
+            local_hw_shapes.append(torch.Tensor([*tile.shape[-2:]]).to(device).int())
+
+        # concat all tiles on local rank
+        local_tiles = torch.cat(local_tiles, dim=-1)
+        local_hw_shapes = torch.stack(local_hw_shapes)
+
+        # get all hw shapes for each rank (perhaps has different shapes for last tile)
+        gathered_shape_list = [torch.empty((num_tiles, 2), dtype=local_hw_shapes.dtype, device=device) 
+                        for num_tiles in self.num_tiles_per_rank]
+        dist.all_gather(gathered_shape_list, local_hw_shapes, group=self.vae_dp_group)
+
+        # gather tiles on all ranks
+        tile_shape_first = local_tiles.shape[:-1]
+        gathered_tiles = [
+            torch.empty(
+                (*tile_shape_first, tiles_shape.prod(dim=1).sum().item()), 
+                dtype=local_tiles.dtype, device=device) for tiles_shape in gathered_shape_list
+        ]
+        dist.all_gather(gathered_tiles, local_tiles, group=self.vae_dp_group)
+
+        # put tiles in rows based on tile_idxs_per_rank
+        rows = [[None] * self.w_split for _ in range(self.h_split)]
+        for rank_idx, tile_idxs in enumerate(self.tile_idxs_per_rank):
+            if not tile_idxs:
+                continue
+            rank_tile_hw_shapes = gathered_shape_list[rank_idx]
+            hw_start_idx = 0
+            # perhaps has more than one tile in each rank, get each by hw_shapes
+            for tile_idx, (h_idx, w_idx) in enumerate(tile_idxs):
+                rank_tile_hw_shape = rank_tile_hw_shapes[tile_idx]
+                hw_end_idx = hw_start_idx + rank_tile_hw_shape.prod().item() # flattend hw
+                rows[h_idx][w_idx] = gathered_tiles[rank_idx][..., hw_start_idx:hw_end_idx].unflatten(
+                    -1, rank_tile_hw_shape.tolist()) # unflatten hw dim
+                hw_start_idx = hw_end_idx
+
+        return rows
\ No newline at end of file
diff --git a/src/diffusers/models/cache_utils.py b/src/diffusers/models/cache_utils.py
index 605c0d588c8c..f4ad1af278f5 100644
--- a/src/diffusers/models/cache_utils.py
+++ b/src/diffusers/models/cache_utils.py
@@ -67,9 +67,11 @@ def enable_cache(self, config) -> None:
             FasterCacheConfig,
             FirstBlockCacheConfig,
             PyramidAttentionBroadcastConfig,
+            TaylorSeerCacheConfig,
             apply_faster_cache,
             apply_first_block_cache,
             apply_pyramid_attention_broadcast,
+            apply_taylorseer_cache,
         )
 
         if self.is_cache_enabled:
@@ -83,16 +85,25 @@ def enable_cache(self, config) -> None:
             apply_first_block_cache(self, config)
         elif isinstance(config, PyramidAttentionBroadcastConfig):
             apply_pyramid_attention_broadcast(self, config)
+        elif isinstance(config, TaylorSeerCacheConfig):
+            apply_taylorseer_cache(self, config)
         else:
             raise ValueError(f"Cache config {type(config)} is not supported.")
 
         self._cache_config = config
 
     def disable_cache(self) -> None:
-        from ..hooks import FasterCacheConfig, FirstBlockCacheConfig, HookRegistry, PyramidAttentionBroadcastConfig
+        from ..hooks import (
+            FasterCacheConfig,
+            FirstBlockCacheConfig,
+            HookRegistry,
+            PyramidAttentionBroadcastConfig,
+            TaylorSeerCacheConfig,
+        )
         from ..hooks.faster_cache import _FASTER_CACHE_BLOCK_HOOK, _FASTER_CACHE_DENOISER_HOOK
         from ..hooks.first_block_cache import _FBC_BLOCK_HOOK, _FBC_LEADER_BLOCK_HOOK
         from ..hooks.pyramid_attention_broadcast import _PYRAMID_ATTENTION_BROADCAST_HOOK
+        from ..hooks.taylorseer_cache import _TAYLORSEER_CACHE_HOOK
 
         if self._cache_config is None:
             logger.warning("Caching techniques have not been enabled, so there's nothing to disable.")
@@ -107,6 +118,8 @@ def disable_cache(self) -> None:
             registry.remove_hook(_FBC_BLOCK_HOOK, recurse=True)
         elif isinstance(self._cache_config, PyramidAttentionBroadcastConfig):
             registry.remove_hook(_PYRAMID_ATTENTION_BROADCAST_HOOK, recurse=True)
+        elif isinstance(self._cache_config, TaylorSeerCacheConfig):
+            registry.remove_hook(_TAYLORSEER_CACHE_HOOK, recurse=True)
         else:
             raise ValueError(f"Cache config {type(self._cache_config)} is not supported.")
 
diff --git a/src/diffusers/models/controlnets/controlnet.py b/src/diffusers/models/controlnets/controlnet.py
index 3f330999d4c8..5c89c9267db4 100644
--- a/src/diffusers/models/controlnets/controlnet.py
+++ b/src/diffusers/models/controlnets/controlnet.py
@@ -21,10 +21,10 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import BaseOutput, logging
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -106,7 +106,7 @@ def forward(self, conditioning):
         return embedding
 
 
-class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+class ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin):
     """
     A ControlNet model.
 
@@ -515,66 +515,6 @@ def from_unet(
 
         return controlnet
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/controlnets/controlnet_flux.py b/src/diffusers/models/controlnets/controlnet_flux.py
index 063ff5bd8e2d..639a8ad7390a 100644
--- a/src/diffusers/models/controlnets/controlnet_flux.py
+++ b/src/diffusers/models/controlnets/controlnet_flux.py
@@ -21,7 +21,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
-from ..attention_processor import AttentionProcessor
+from ..attention import AttentionMixin
 from ..controlnets.controlnet import ControlNetConditioningEmbedding, zero_module
 from ..embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
 from ..modeling_outputs import Transformer2DModelOutput
@@ -38,7 +38,7 @@ class FluxControlNetOutput(BaseOutput):
     controlnet_single_block_samples: Tuple[torch.Tensor]
 
 
-class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+class FluxControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin):
     _supports_gradient_checkpointing = True
 
     @register_to_config
@@ -118,66 +118,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self):
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     @classmethod
     def from_transformer(
         cls,
diff --git a/src/diffusers/models/controlnets/controlnet_qwenimage.py b/src/diffusers/models/controlnets/controlnet_qwenimage.py
index 7c4955eb5828..86971271788f 100644
--- a/src/diffusers/models/controlnets/controlnet_qwenimage.py
+++ b/src/diffusers/models/controlnets/controlnet_qwenimage.py
@@ -21,7 +21,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
-from ..attention_processor import AttentionProcessor
+from ..attention import AttentionMixin
 from ..cache_utils import CacheMixin
 from ..controlnets.controlnet import zero_module
 from ..modeling_outputs import Transformer2DModelOutput
@@ -42,7 +42,9 @@ class QwenImageControlNetOutput(BaseOutput):
     controlnet_block_samples: Tuple[torch.Tensor]
 
 
-class QwenImageControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+class QwenImageControlNetModel(
+    ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin
+):
     _supports_gradient_checkpointing = True
 
     @register_to_config
@@ -92,66 +94,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self):
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     @classmethod
     def from_transformer(
         cls,
diff --git a/src/diffusers/models/controlnets/controlnet_sana.py b/src/diffusers/models/controlnets/controlnet_sana.py
index ed521adbedda..c71a8b326635 100644
--- a/src/diffusers/models/controlnets/controlnet_sana.py
+++ b/src/diffusers/models/controlnets/controlnet_sana.py
@@ -21,7 +21,7 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
-from ..attention_processor import AttentionProcessor
+from ..attention import AttentionMixin
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -38,7 +38,7 @@ class SanaControlNetOutput(BaseOutput):
     controlnet_block_samples: Tuple[torch.Tensor]
 
 
-class SanaControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+class SanaControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin):
     _supports_gradient_checkpointing = True
     _no_split_modules = ["SanaTransformerBlock", "PatchEmbed"]
     _skip_layerwise_casting_patterns = ["patch_embed", "norm"]
@@ -117,66 +117,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py
index 0641c8bc0114..08b86ff344eb 100644
--- a/src/diffusers/models/controlnets/controlnet_sd3.py
+++ b/src/diffusers/models/controlnets/controlnet_sd3.py
@@ -22,8 +22,8 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
-from ..attention import JointTransformerBlock
-from ..attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
+from ..attention import AttentionMixin, JointTransformerBlock
+from ..attention_processor import Attention, FusedJointAttnProcessor2_0
 from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -39,7 +39,7 @@ class SD3ControlNetOutput(BaseOutput):
     controlnet_block_samples: Tuple[torch.Tensor]
 
 
-class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+class SD3ControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
 
@@ -204,66 +204,6 @@ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int
         for module in self.children():
             fn_recursive_feed_forward(module, chunk_size, dim)
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.transformers.transformer_sd3.SD3Transformer2DModel.fuse_qkv_projections
     def fuse_qkv_projections(self):
         """
diff --git a/src/diffusers/models/controlnets/controlnet_sparsectrl.py b/src/diffusers/models/controlnets/controlnet_sparsectrl.py
index b04be5726234..8e7faf2d44b0 100644
--- a/src/diffusers/models/controlnets/controlnet_sparsectrl.py
+++ b/src/diffusers/models/controlnets/controlnet_sparsectrl.py
@@ -22,10 +22,10 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin
 from ...utils import BaseOutput, logging
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -93,7 +93,7 @@ def forward(self, conditioning: torch.Tensor) -> torch.Tensor:
         return embedding
 
 
-class SparseControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+class SparseControlNetModel(ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin):
     """
     A SparseControlNet model as described in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion
     Models](https://huggingface.co/papers/2311.16933).
@@ -448,66 +448,6 @@ def from_unet(
 
         return controlnet
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/controlnets/controlnet_union.py b/src/diffusers/models/controlnets/controlnet_union.py
index 3df3bbe312e9..b4ee6536ca2f 100644
--- a/src/diffusers/models/controlnets/controlnet_union.py
+++ b/src/diffusers/models/controlnets/controlnet_union.py
@@ -19,10 +19,10 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import logging
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -81,7 +81,7 @@ def forward(self, x: torch.Tensor):
         return x
 
 
-class ControlNetUnionModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+class ControlNetUnionModel(ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin):
     """
     A ControlNetUnion model.
 
@@ -455,66 +455,6 @@ def from_unet(
 
         return controlnet
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/controlnets/controlnet_xs.py b/src/diffusers/models/controlnets/controlnet_xs.py
index f5c69b9a46ad..119492b0fac4 100644
--- a/src/diffusers/models/controlnets/controlnet_xs.py
+++ b/src/diffusers/models/controlnets/controlnet_xs.py
@@ -21,11 +21,11 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import BaseOutput, logging
 from ...utils.torch_utils import apply_freeu
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     Attention,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
     FusedAttnProcessor2_0,
@@ -241,7 +241,7 @@ def get_up_block_adapter(
     return UpBlockControlNetXSAdapter(ctrl_to_base=nn.ModuleList(ctrl_to_base))
 
 
-class ControlNetXSAdapter(ModelMixin, ConfigMixin):
+class ControlNetXSAdapter(ModelMixin, AttentionMixin, ConfigMixin):
     r"""
     A `ControlNetXSAdapter` model. To use it, pass it into a `UNetControlNetXSModel` (together with a
     `UNet2DConditionModel` base model).
@@ -293,14 +293,14 @@ def __init__(
         self,
         conditioning_channels: int = 3,
         conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
         time_embedding_mix: float = 1.0,
         learn_time_embedding: bool = False,
         num_attention_heads: Union[int, Tuple[int]] = 4,
-        block_out_channels: Tuple[int] = (4, 8, 16, 16),
-        base_block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (4, 8, 16, 16),
+        base_block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         cross_attention_dim: int = 1024,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
@@ -436,7 +436,7 @@ def from_unet(
         time_embedding_mix: int = 1.0,
         conditioning_channels: int = 3,
         conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
     ):
         r"""
         Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].
@@ -508,7 +508,7 @@ def forward(self, *args, **kwargs):
         )
 
 
-class UNetControlNetXSModel(ModelMixin, ConfigMixin):
+class UNetControlNetXSModel(ModelMixin, AttentionMixin, ConfigMixin):
     r"""
     A UNet fused with a ControlNet-XS adapter model
 
@@ -529,14 +529,19 @@ def __init__(
         self,
         # unet configs
         sample_size: Optional[int] = 96,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         norm_num_groups: Optional[int] = 32,
         cross_attention_dim: Union[int, Tuple[int]] = 1024,
         transformer_layers_per_block: Union[int, Tuple[int]] = 1,
@@ -550,10 +555,10 @@ def __init__(
         # additional controlnet configs
         time_embedding_mix: float = 1.0,
         ctrl_conditioning_channels: int = 3,
-        ctrl_conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        ctrl_conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
         ctrl_conditioning_channel_order: str = "rgb",
         ctrl_learn_time_embedding: bool = False,
-        ctrl_block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        ctrl_block_out_channels: Tuple[int, ...] = (4, 8, 16, 16),
         ctrl_num_attention_heads: Union[int, Tuple[int]] = 4,
         ctrl_max_norm_num_groups: int = 32,
     ):
@@ -863,66 +868,6 @@ def freeze_unet_params(self) -> None:
         for u in self.up_blocks:
             u.freeze_base_params()
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 91daca1ad809..41da95d3a2a2 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -531,6 +531,8 @@ def enable_group_offload(
         record_stream: bool = False,
         low_cpu_mem_usage=False,
         offload_to_disk_path: Optional[str] = None,
+        block_modules: Optional[str] = None,
+        exclude_kwargs: Optional[str] = None,
     ) -> None:
         r"""
         Activates group offloading for the current model.
@@ -570,6 +572,7 @@ def enable_group_offload(
                 f"`_supports_group_offloading` to `True` in the class definition. If you believe this is a mistake, please "
                 f"open an issue at https://github.com/huggingface/diffusers/issues."
             )
+
         apply_group_offloading(
             module=self,
             onload_device=onload_device,
@@ -581,6 +584,8 @@ def enable_group_offload(
             record_stream=record_stream,
             low_cpu_mem_usage=low_cpu_mem_usage,
             offload_to_disk_path=offload_to_disk_path,
+            block_modules=block_modules,
+            exclude_kwargs=exclude_kwargs,
         )
 
     def set_attention_backend(self, backend: str) -> None:
@@ -595,7 +600,11 @@ def set_attention_backend(self, backend: str) -> None:
                 attention as backend.
         """
         from .attention import AttentionModuleMixin
-        from .attention_dispatch import AttentionBackendName, _check_attention_backend_requirements
+        from .attention_dispatch import (
+            AttentionBackendName,
+            _check_attention_backend_requirements,
+            _maybe_download_kernel_for_backend,
+        )
 
         # TODO: the following will not be required when everything is refactored to AttentionModuleMixin
         from .attention_processor import Attention, MochiAttention
@@ -606,8 +615,10 @@ def set_attention_backend(self, backend: str) -> None:
         available_backends = {x.value for x in AttentionBackendName.__members__.values()}
         if backend not in available_backends:
             raise ValueError(f"`{backend=}` must be one of the following: " + ", ".join(available_backends))
+
         backend = AttentionBackendName(backend)
         _check_attention_backend_requirements(backend)
+        _maybe_download_kernel_for_backend(backend)
 
         attention_classes = (Attention, MochiAttention, AttentionModuleMixin)
         for module in self.modules():
@@ -1484,59 +1495,71 @@ def enable_parallelism(
         config: Union[ParallelConfig, ContextParallelConfig],
         cp_plan: Optional[Dict[str, ContextParallelModelPlan]] = None,
     ):
-        from ..hooks.context_parallel import apply_context_parallel
-        from .attention import AttentionModuleMixin
-        from .attention_processor import Attention, MochiAttention
-
         logger.warning(
             "`enable_parallelism` is an experimental feature. The API may change in the future and breaking changes may be introduced at any time without warning."
         )
 
+        if not torch.distributed.is_available() and not torch.distributed.is_initialized():
+            raise RuntimeError(
+                "torch.distributed must be available and initialized before calling `enable_parallelism`."
+            )
+
+        from ..hooks.context_parallel import apply_context_parallel
+        from .attention import AttentionModuleMixin
+        from .attention_dispatch import AttentionBackendName, _AttentionBackendRegistry
+        from .attention_processor import Attention, MochiAttention
+
         if isinstance(config, ContextParallelConfig):
             config = ParallelConfig(context_parallel_config=config)
 
-        if not torch.distributed.is_initialized():
-            raise RuntimeError("torch.distributed must be initialized before calling `enable_parallelism`.")
-
         rank = torch.distributed.get_rank()
         world_size = torch.distributed.get_world_size()
         device_type = torch._C._get_accelerator().type
         device_module = torch.get_device_module(device_type)
         device = torch.device(device_type, rank % device_module.device_count())
 
-        cp_mesh = None
+        attention_classes = (Attention, MochiAttention, AttentionModuleMixin)
+
         if config.context_parallel_config is not None:
-            cp_config = config.context_parallel_config
-            if cp_config.ring_degree < 1 or cp_config.ulysses_degree < 1:
-                raise ValueError("`ring_degree` and `ulysses_degree` must be greater than or equal to 1.")
-            if cp_config.ring_degree > 1 and cp_config.ulysses_degree > 1:
-                raise ValueError(
-                    "Unified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1."
-                )
-            if cp_config.ring_degree * cp_config.ulysses_degree > world_size:
-                raise ValueError(
-                    f"The product of `ring_degree` ({cp_config.ring_degree}) and `ulysses_degree` ({cp_config.ulysses_degree}) must not exceed the world size ({world_size})."
-                )
-            cp_mesh = torch.distributed.device_mesh.init_device_mesh(
-                device_type=device_type,
-                mesh_shape=(cp_config.ring_degree, cp_config.ulysses_degree),
-                mesh_dim_names=("ring", "ulysses"),
-            )
+            for module in self.modules():
+                if not isinstance(module, attention_classes):
+                    continue
+
+                processor = module.processor
+                if processor is None or not hasattr(processor, "_attention_backend"):
+                    continue
 
-        config.setup(rank, world_size, device, cp_mesh=cp_mesh)
+                attention_backend = processor._attention_backend
+                if attention_backend is None:
+                    attention_backend, _ = _AttentionBackendRegistry.get_active_backend()
+                else:
+                    attention_backend = AttentionBackendName(attention_backend)
+
+                if not _AttentionBackendRegistry._is_context_parallel_available(attention_backend):
+                    compatible_backends = sorted(_AttentionBackendRegistry._supports_context_parallel)
+                    raise ValueError(
+                        f"Context parallelism is enabled but the attention processor '{processor.__class__.__name__}' "
+                        f"is using backend '{attention_backend.value}' which does not support context parallelism. "
+                        f"Please set a compatible attention backend: {compatible_backends} using `model.set_attention_backend()` before "
+                        f"calling `enable_parallelism()`."
+                    )
 
-        if cp_plan is None and self._cp_plan is None:
-            raise ValueError(
-                "`cp_plan` must be provided either as an argument or set in the model's `_cp_plan` attribute."
-            )
-        cp_plan = cp_plan if cp_plan is not None else self._cp_plan
+                # All modules use the same attention processor and backend. We don't need to
+                # iterate over all modules after checking the first processor
+                break
 
+        mesh = None
         if config.context_parallel_config is not None:
-            apply_context_parallel(self, config.context_parallel_config, cp_plan)
+            cp_config = config.context_parallel_config
+            mesh = torch.distributed.device_mesh.init_device_mesh(
+                device_type=device_type,
+                mesh_shape=cp_config.mesh_shape,
+                mesh_dim_names=cp_config.mesh_dim_names,
+            )
 
+        config.setup(rank, world_size, device, mesh=mesh)
         self._parallel_config = config
 
-        attention_classes = (Attention, MochiAttention, AttentionModuleMixin)
         for module in self.modules():
             if not isinstance(module, attention_classes):
                 continue
@@ -1545,6 +1568,14 @@ def enable_parallelism(
                 continue
             processor._parallel_config = config
 
+        if config.context_parallel_config is not None:
+            if cp_plan is None and self._cp_plan is None:
+                raise ValueError(
+                    "`cp_plan` must be provided either as an argument or set in the model's `_cp_plan` attribute."
+                )
+            cp_plan = cp_plan if cp_plan is not None else self._cp_plan
+            apply_context_parallel(self, config.context_parallel_config, cp_plan)
+
     @classmethod
     def _load_pretrained_model(
         cls,
diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py
index 2fe1159eec4c..a42f6b2716e1 100755
--- a/src/diffusers/models/transformers/__init__.py
+++ b/src/diffusers/models/transformers/__init__.py
@@ -20,13 +20,16 @@
     from .transformer_bria import BriaTransformer2DModel
     from .transformer_bria_fibo import BriaFiboTransformer2DModel
     from .transformer_chroma import ChromaTransformer2DModel
+    from .transformer_chronoedit import ChronoEditTransformer3DModel
     from .transformer_cogview3plus import CogView3PlusTransformer2DModel
     from .transformer_cogview4 import CogView4Transformer2DModel
     from .transformer_cosmos import CosmosTransformer3DModel
     from .transformer_easyanimate import EasyAnimateTransformer3DModel
     from .transformer_flux import FluxTransformer2DModel
+    from .transformer_flux2 import Flux2Transformer2DModel
     from .transformer_hidream_image import HiDreamImageTransformer2DModel
     from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
+    from .transformer_hunyuan_video15 import HunyuanVideo15Transformer3DModel
     from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
     from .transformer_hunyuanimage import HunyuanImageTransformer2DModel
     from .transformer_kandinsky import Kandinsky5Transformer3DModel
@@ -34,10 +37,14 @@
     from .transformer_lumina2 import Lumina2Transformer2DModel
     from .transformer_mochi import MochiTransformer3DModel
     from .transformer_omnigen import OmniGenTransformer2DModel
+    from .transformer_ovis_image import OvisImageTransformer2DModel
     from .transformer_prx import PRXTransformer2DModel
     from .transformer_qwenimage import QwenImageTransformer2DModel
+    from .transformer_sana_video import SanaVideoTransformer3DModel
     from .transformer_sd3 import SD3Transformer2DModel
     from .transformer_skyreels_v2 import SkyReelsV2Transformer3DModel
     from .transformer_temporal import TransformerTemporalModel
     from .transformer_wan import WanTransformer3DModel
+    from .transformer_wan_animate import WanAnimateTransformer3DModel
     from .transformer_wan_vace import WanVACETransformer3DModel
+    from .transformer_z_image import ZImageTransformer2DModel
diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
index bf6d9e1b3803..e3732662e408 100644
--- a/src/diffusers/models/transformers/auraflow_transformer_2d.py
+++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -23,9 +23,9 @@
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention import AttentionMixin
 from ..attention_processor import (
     Attention,
-    AttentionProcessor,
     AuraFlowAttnProcessor2_0,
     FusedAuraFlowAttnProcessor2_0,
 )
@@ -275,7 +275,7 @@ def forward(
         return encoder_hidden_states, hidden_states
 
 
-class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+class AuraFlowTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     A 2D Transformer model as introduced in AuraFlow (https://blog.fal.ai/auraflow/).
 
@@ -365,66 +365,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedAuraFlowAttnProcessor2_0
     def fuse_qkv_projections(self):
         """
diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
index 9e0afdee6615..14b38cd46c52 100644
--- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py
+++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
@@ -22,8 +22,8 @@
 from ...loaders import PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import Attention, FeedForward
-from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
+from ..attention import Attention, AttentionMixin, FeedForward
+from ..attention_processor import CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
 from ..cache_utils import CacheMixin
 from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
 from ..modeling_outputs import Transformer2DModelOutput
@@ -157,7 +157,7 @@ def forward(
         return hidden_states, encoder_hidden_states
 
 
-class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
+class CogVideoXTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
     """
     A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
 
@@ -331,66 +331,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
     def fuse_qkv_projections(self):
         """
diff --git a/src/diffusers/models/transformers/consisid_transformer_3d.py b/src/diffusers/models/transformers/consisid_transformer_3d.py
index 91fe811f0013..be20b0a3eacf 100644
--- a/src/diffusers/models/transformers/consisid_transformer_3d.py
+++ b/src/diffusers/models/transformers/consisid_transformer_3d.py
@@ -22,8 +22,8 @@
 from ...loaders import PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import Attention, FeedForward
-from ..attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0
+from ..attention import Attention, AttentionMixin, FeedForward
+from ..attention_processor import CogVideoXAttnProcessor2_0
 from ..embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -348,7 +348,7 @@ def forward(
         return hidden_states, encoder_hidden_states
 
 
-class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+class ConsisIDTransformer3DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin):
     """
     A Transformer model for video-like data in [ConsisID](https://github.com/PKU-YuanGroup/ConsisID).
 
@@ -620,66 +620,6 @@ def _init_face_inputs(self):
             ]
         )
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/diffusers/models/transformers/hunyuan_transformer_2d.py b/src/diffusers/models/transformers/hunyuan_transformer_2d.py
index fbe9fe8df91c..cecb675b32b7 100644
--- a/src/diffusers/models/transformers/hunyuan_transformer_2d.py
+++ b/src/diffusers/models/transformers/hunyuan_transformer_2d.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Optional, Union
+from typing import Optional
 
 import torch
 from torch import nn
@@ -19,8 +19,8 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import FeedForward
-from ..attention_processor import Attention, AttentionProcessor, FusedHunyuanAttnProcessor2_0, HunyuanAttnProcessor2_0
+from ..attention import AttentionMixin, FeedForward
+from ..attention_processor import Attention, FusedHunyuanAttnProcessor2_0, HunyuanAttnProcessor2_0
 from ..embeddings import (
     HunyuanCombinedTimestepTextSizeStyleEmbedding,
     PatchEmbed,
@@ -200,7 +200,7 @@ def forward(
         return hidden_states
 
 
-class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
+class HunyuanDiT2DModel(ModelMixin, AttentionMixin, ConfigMixin):
     """
     HunYuanDiT: Diffusion model with a Transformer backbone.
 
@@ -350,66 +350,6 @@ def unfuse_qkv_projections(self):
         if self.original_attn_processors is not None:
             self.set_attn_processor(self.original_attn_processors)
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
diff --git a/src/diffusers/models/transformers/pixart_transformer_2d.py b/src/diffusers/models/transformers/pixart_transformer_2d.py
index 5a22144228ae..072670ee0c30 100644
--- a/src/diffusers/models/transformers/pixart_transformer_2d.py
+++ b/src/diffusers/models/transformers/pixart_transformer_2d.py
@@ -11,15 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional
 
 import torch
 from torch import nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
-from ..attention import BasicTransformerBlock
-from ..attention_processor import Attention, AttentionProcessor, AttnProcessor, FusedAttnProcessor2_0
+from ..attention import AttentionMixin, BasicTransformerBlock
+from ..attention_processor import Attention, AttnProcessor, FusedAttnProcessor2_0
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -29,7 +29,7 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class PixArtTransformer2DModel(ModelMixin, ConfigMixin):
+class PixArtTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin):
     r"""
     A 2D Transformer model as introduced in PixArt family of models (https://huggingface.co/papers/2310.00426,
     https://huggingface.co/papers/2403.04692).
@@ -184,66 +184,6 @@ def __init__(
                 in_features=self.config.caption_channels, hidden_size=self.inner_dim
             )
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py
index 565da0da8b6e..757bb436040f 100644
--- a/src/diffusers/models/transformers/prior_transformer.py
+++ b/src/diffusers/models/transformers/prior_transformer.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict, Optional, Union
+from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -8,11 +8,10 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
 from ...utils import BaseOutput
-from ..attention import BasicTransformerBlock
+from ..attention import AttentionMixin, BasicTransformerBlock
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -33,7 +32,7 @@ class PriorTransformerOutput(BaseOutput):
     predicted_image_embedding: torch.Tensor
 
 
-class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+class PriorTransformer(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
     """
     A Prior Transformer model.
 
@@ -166,66 +165,6 @@ def __init__(
         self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim))
         self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim))
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/transformers/sana_transformer.py b/src/diffusers/models/transformers/sana_transformer.py
index 1e02ac32e86c..d45badb1b121 100644
--- a/src/diffusers/models/transformers/sana_transformer.py
+++ b/src/diffusers/models/transformers/sana_transformer.py
@@ -21,9 +21,9 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ..attention import AttentionMixin
 from ..attention_processor import (
     Attention,
-    AttentionProcessor,
     SanaLinearAttnProcessor2_0,
 )
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, TimestepEmbedding, Timesteps
@@ -289,7 +289,7 @@ def forward(
         return hidden_states
 
 
-class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+class SanaTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models.
 
@@ -414,66 +414,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/diffusers/models/transformers/stable_audio_transformer.py b/src/diffusers/models/transformers/stable_audio_transformer.py
index ac9b3fca419c..2c3b6b5df91d 100644
--- a/src/diffusers/models/transformers/stable_audio_transformer.py
+++ b/src/diffusers/models/transformers/stable_audio_transformer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-from typing import Dict, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -22,8 +22,8 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import FeedForward
-from ..attention_processor import Attention, AttentionProcessor, StableAudioAttnProcessor2_0
+from ..attention import AttentionMixin, FeedForward
+from ..attention_processor import Attention, StableAudioAttnProcessor2_0
 from ..modeling_utils import ModelMixin
 from ..transformers.transformer_2d import Transformer2DModelOutput
 
@@ -182,7 +182,7 @@ def forward(
         return hidden_states
 
 
-class StableAudioDiTModel(ModelMixin, ConfigMixin):
+class StableAudioDiTModel(ModelMixin, AttentionMixin, ConfigMixin):
     """
     The Diffusion Transformer model introduced in Stable Audio.
 
@@ -274,66 +274,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.transformers.hunyuan_transformer_2d.HunyuanDiT2DModel.set_default_attn_processor with Hunyuan->StableAudio
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/transformers/transformer_chronoedit.py b/src/diffusers/models/transformers/transformer_chronoedit.py
new file mode 100644
index 000000000000..79828b6464f4
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_chronoedit.py
@@ -0,0 +1,739 @@
+# Copyright 2025 The ChronoEdit Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils.torch_utils import maybe_allow_in_graph
+from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
+from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..cache_utils import CacheMixin
+from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import FP32LayerNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.models.transformers.transformer_wan._get_qkv_projections
+def _get_qkv_projections(attn: "WanAttention", hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor):
+    # encoder_hidden_states is only passed for cross-attention
+    if encoder_hidden_states is None:
+        encoder_hidden_states = hidden_states
+
+    if attn.fused_projections:
+        if attn.cross_attention_dim_head is None:
+            # In self-attention layers, we can fuse the entire QKV projection into a single linear
+            query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
+        else:
+            # In cross-attention layers, we can only fuse the KV projections into a single linear
+            query = attn.to_q(hidden_states)
+            key, value = attn.to_kv(encoder_hidden_states).chunk(2, dim=-1)
+    else:
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+    return query, key, value
+
+
+# Copied from diffusers.models.transformers.transformer_wan._get_added_kv_projections
+def _get_added_kv_projections(attn: "WanAttention", encoder_hidden_states_img: torch.Tensor):
+    if attn.fused_projections:
+        key_img, value_img = attn.to_added_kv(encoder_hidden_states_img).chunk(2, dim=-1)
+    else:
+        key_img = attn.add_k_proj(encoder_hidden_states_img)
+        value_img = attn.add_v_proj(encoder_hidden_states_img)
+    return key_img, value_img
+
+
+# modified from diffusers.models.transformers.transformer_wan.WanAttnProcessor
+class WanAttnProcessor:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "WanAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: "WanAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        encoder_hidden_states_img = None
+        if attn.add_k_proj is not None:
+            # 512 is the context length of the text encoder, hardcoded for now
+            image_context_length = encoder_hidden_states.shape[1] - 512
+            encoder_hidden_states_img = encoder_hidden_states[:, :image_context_length]
+            encoder_hidden_states = encoder_hidden_states[:, image_context_length:]
+
+        query, key, value = _get_qkv_projections(attn, hidden_states, encoder_hidden_states)
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
+
+        if rotary_emb is not None:
+
+            def apply_rotary_emb(
+                hidden_states: torch.Tensor,
+                freqs_cos: torch.Tensor,
+                freqs_sin: torch.Tensor,
+            ):
+                x1, x2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1)
+                cos = freqs_cos[..., 0::2]
+                sin = freqs_sin[..., 1::2]
+                out = torch.empty_like(hidden_states)
+                out[..., 0::2] = x1 * cos - x2 * sin
+                out[..., 1::2] = x1 * sin + x2 * cos
+                return out.type_as(hidden_states)
+
+            query = apply_rotary_emb(query, *rotary_emb)
+            key = apply_rotary_emb(key, *rotary_emb)
+
+        # I2V task
+        hidden_states_img = None
+        if encoder_hidden_states_img is not None:
+            key_img, value_img = _get_added_kv_projections(attn, encoder_hidden_states_img)
+            key_img = attn.norm_added_k(key_img)
+
+            key_img = key_img.unflatten(2, (attn.heads, -1))
+            value_img = value_img.unflatten(2, (attn.heads, -1))
+
+            hidden_states_img = dispatch_attention_fn(
+                query,
+                key_img,
+                value_img,
+                attn_mask=None,
+                dropout_p=0.0,
+                is_causal=False,
+                backend=self._attention_backend,
+                # Reference: https://github.com/huggingface/diffusers/pull/12660
+                parallel_config=None,
+            )
+            hidden_states_img = hidden_states_img.flatten(2, 3)
+            hidden_states_img = hidden_states_img.type_as(query)
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            # Reference: https://github.com/huggingface/diffusers/pull/12660
+            parallel_config=(self._parallel_config if encoder_hidden_states is None else None),
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+
+        if hidden_states_img is not None:
+            hidden_states = hidden_states + hidden_states_img
+
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanAttnProcessor2_0
+class WanAttnProcessor2_0:
+    def __new__(cls, *args, **kwargs):
+        deprecation_message = (
+            "The WanAttnProcessor2_0 class is deprecated and will be removed in a future version. "
+            "Please use WanAttnProcessor instead. "
+        )
+        deprecate("WanAttnProcessor2_0", "1.0.0", deprecation_message, standard_warn=False)
+        return WanAttnProcessor(*args, **kwargs)
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanAttention
+class WanAttention(torch.nn.Module, AttentionModuleMixin):
+    _default_processor_cls = WanAttnProcessor
+    _available_processors = [WanAttnProcessor]
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        eps: float = 1e-5,
+        dropout: float = 0.0,
+        added_kv_proj_dim: Optional[int] = None,
+        cross_attention_dim_head: Optional[int] = None,
+        processor=None,
+        is_cross_attention=None,
+    ):
+        super().__init__()
+
+        self.inner_dim = dim_head * heads
+        self.heads = heads
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.cross_attention_dim_head = cross_attention_dim_head
+        self.kv_inner_dim = self.inner_dim if cross_attention_dim_head is None else cross_attention_dim_head * heads
+
+        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_out = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(self.inner_dim, dim, bias=True),
+                torch.nn.Dropout(dropout),
+            ]
+        )
+        self.norm_q = torch.nn.RMSNorm(dim_head * heads, eps=eps, elementwise_affine=True)
+        self.norm_k = torch.nn.RMSNorm(dim_head * heads, eps=eps, elementwise_affine=True)
+
+        self.add_k_proj = self.add_v_proj = None
+        if added_kv_proj_dim is not None:
+            self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
+            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
+            self.norm_added_k = torch.nn.RMSNorm(dim_head * heads, eps=eps)
+
+        self.is_cross_attention = cross_attention_dim_head is not None
+
+        self.set_processor(processor)
+
+    def fuse_projections(self):
+        if getattr(self, "fused_projections", False):
+            return
+
+        if self.cross_attention_dim_head is None:
+            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
+            concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+            out_features, in_features = concatenated_weights.shape
+            with torch.device("meta"):
+                self.to_qkv = nn.Linear(in_features, out_features, bias=True)
+            self.to_qkv.load_state_dict(
+                {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+            )
+        else:
+            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+            concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+            out_features, in_features = concatenated_weights.shape
+            with torch.device("meta"):
+                self.to_kv = nn.Linear(in_features, out_features, bias=True)
+            self.to_kv.load_state_dict(
+                {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+            )
+
+        if self.added_kv_proj_dim is not None:
+            concatenated_weights = torch.cat([self.add_k_proj.weight.data, self.add_v_proj.weight.data])
+            concatenated_bias = torch.cat([self.add_k_proj.bias.data, self.add_v_proj.bias.data])
+            out_features, in_features = concatenated_weights.shape
+            with torch.device("meta"):
+                self.to_added_kv = nn.Linear(in_features, out_features, bias=True)
+            self.to_added_kv.load_state_dict(
+                {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+            )
+
+        self.fused_projections = True
+
+    @torch.no_grad()
+    def unfuse_projections(self):
+        if not getattr(self, "fused_projections", False):
+            return
+
+        if hasattr(self, "to_qkv"):
+            delattr(self, "to_qkv")
+        if hasattr(self, "to_kv"):
+            delattr(self, "to_kv")
+        if hasattr(self, "to_added_kv"):
+            delattr(self, "to_added_kv")
+
+        self.fused_projections = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, rotary_emb, **kwargs)
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanImageEmbedding
+class WanImageEmbedding(torch.nn.Module):
+    def __init__(self, in_features: int, out_features: int, pos_embed_seq_len=None):
+        super().__init__()
+
+        self.norm1 = FP32LayerNorm(in_features)
+        self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
+        self.norm2 = FP32LayerNorm(out_features)
+        if pos_embed_seq_len is not None:
+            self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features))
+        else:
+            self.pos_embed = None
+
+    def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
+        if self.pos_embed is not None:
+            batch_size, seq_len, embed_dim = encoder_hidden_states_image.shape
+            encoder_hidden_states_image = encoder_hidden_states_image.view(-1, 2 * seq_len, embed_dim)
+            encoder_hidden_states_image = encoder_hidden_states_image + self.pos_embed
+
+        hidden_states = self.norm1(encoder_hidden_states_image)
+        hidden_states = self.ff(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanTimeTextImageEmbedding
+class WanTimeTextImageEmbedding(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        time_freq_dim: int,
+        time_proj_dim: int,
+        text_embed_dim: int,
+        image_embed_dim: Optional[int] = None,
+        pos_embed_seq_len: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+        self.act_fn = nn.SiLU()
+        self.time_proj = nn.Linear(dim, time_proj_dim)
+        self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
+
+        self.image_embedder = None
+        if image_embed_dim is not None:
+            self.image_embedder = WanImageEmbedding(image_embed_dim, dim, pos_embed_seq_len=pos_embed_seq_len)
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        timestep_seq_len: Optional[int] = None,
+    ):
+        timestep = self.timesteps_proj(timestep)
+        if timestep_seq_len is not None:
+            timestep = timestep.unflatten(0, (-1, timestep_seq_len))
+
+        time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
+        if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
+            timestep = timestep.to(time_embedder_dtype)
+        temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
+        timestep_proj = self.time_proj(self.act_fn(temb))
+
+        encoder_hidden_states = self.text_embedder(encoder_hidden_states)
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
+
+        return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
+
+
+class ChronoEditRotaryPosEmbed(nn.Module):
+    def __init__(
+        self,
+        attention_head_dim: int,
+        patch_size: Tuple[int, int, int],
+        max_seq_len: int,
+        theta: float = 10000.0,
+        temporal_skip_len: int = 8,
+    ):
+        super().__init__()
+
+        self.attention_head_dim = attention_head_dim
+        self.patch_size = patch_size
+        self.max_seq_len = max_seq_len
+        self.temporal_skip_len = temporal_skip_len
+
+        h_dim = w_dim = 2 * (attention_head_dim // 6)
+        t_dim = attention_head_dim - h_dim - w_dim
+        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+
+        freqs_cos = []
+        freqs_sin = []
+
+        for dim in [t_dim, h_dim, w_dim]:
+            freq_cos, freq_sin = get_1d_rotary_pos_embed(
+                dim,
+                max_seq_len,
+                theta,
+                use_real=True,
+                repeat_interleave_real=True,
+                freqs_dtype=freqs_dtype,
+            )
+            freqs_cos.append(freq_cos)
+            freqs_sin.append(freq_sin)
+
+        self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
+        self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.patch_size
+        ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+
+        split_sizes = [
+            self.attention_head_dim - 2 * (self.attention_head_dim // 3),
+            self.attention_head_dim // 3,
+            self.attention_head_dim // 3,
+        ]
+
+        freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
+        freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
+
+        if num_frames == 2:
+            freqs_cos_f = freqs_cos[0][: self.temporal_skip_len][[0, -1]].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        else:
+            freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        if num_frames == 2:
+            freqs_sin_f = freqs_sin[0][: self.temporal_skip_len][[0, -1]].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        else:
+            freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+        freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+
+        return freqs_cos, freqs_sin
+
+
+@maybe_allow_in_graph
+# Copied from diffusers.models.transformers.transformer_wan.WanTransformerBlock
+class WanTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        ffn_dim: int,
+        num_heads: int,
+        qk_norm: str = "rms_norm_across_heads",
+        cross_attn_norm: bool = False,
+        eps: float = 1e-6,
+        added_kv_proj_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        # 1. Self-attention
+        self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+        self.attn1 = WanAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            cross_attention_dim_head=None,
+            processor=WanAttnProcessor(),
+        )
+
+        # 2. Cross-attention
+        self.attn2 = WanAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            added_kv_proj_dim=added_kv_proj_dim,
+            cross_attention_dim_head=dim // num_heads,
+            processor=WanAttnProcessor(),
+        )
+        self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+        # 3. Feed-forward
+        self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+        self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        rotary_emb: torch.Tensor,
+    ) -> torch.Tensor:
+        if temb.ndim == 4:
+            # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table.unsqueeze(0) + temb.float()
+            ).chunk(6, dim=2)
+            # batch_size, seq_len, 1, inner_dim
+            shift_msa = shift_msa.squeeze(2)
+            scale_msa = scale_msa.squeeze(2)
+            gate_msa = gate_msa.squeeze(2)
+            c_shift_msa = c_shift_msa.squeeze(2)
+            c_scale_msa = c_scale_msa.squeeze(2)
+            c_gate_msa = c_gate_msa.squeeze(2)
+        else:
+            # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B)
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table + temb.float()
+            ).chunk(6, dim=1)
+
+        # 1. Self-attention
+        norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, None, None, rotary_emb)
+        hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+
+        # 2. Cross-attention
+        norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+        attn_output = self.attn2(norm_hidden_states, encoder_hidden_states, None, None)
+        hidden_states = hidden_states + attn_output
+
+        # 3. Feed-forward
+        norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
+            hidden_states
+        )
+        ff_output = self.ffn(norm_hidden_states)
+        hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+
+        return hidden_states
+
+
+# modified from diffusers.models.transformers.transformer_wan.WanTransformer3DModel
+class ChronoEditTransformer3DModel(
+    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
+):
+    r"""
+    A Transformer model for video-like data used in the ChronoEdit model.
+
+    Args:
+        patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+        num_attention_heads (`int`, defaults to `40`):
+            Fixed length for text embeddings.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output.
+        text_dim (`int`, defaults to `512`):
+            Input dimension for text embeddings.
+        freq_dim (`int`, defaults to `256`):
+            Dimension for sinusoidal time embeddings.
+        ffn_dim (`int`, defaults to `13824`):
+            Intermediate dimension in feed-forward network.
+        num_layers (`int`, defaults to `40`):
+            The number of layers of transformer blocks to use.
+        window_size (`Tuple[int]`, defaults to `(-1, -1)`):
+            Window size for local attention (-1 indicates global attention).
+        cross_attn_norm (`bool`, defaults to `True`):
+            Enable cross-attention normalization.
+        qk_norm (`bool`, defaults to `True`):
+            Enable query/key normalization.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        add_img_emb (`bool`, defaults to `False`):
+            Whether to use img_emb.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+    """
+
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+    _no_split_modules = ["WanTransformerBlock"]
+    _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+    _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+    _repeated_blocks = ["WanTransformerBlock"]
+    _cp_plan = {
+        "rope": {
+            0: ContextParallelInput(split_dim=1, expected_dims=4, split_output=True),
+            1: ContextParallelInput(split_dim=1, expected_dims=4, split_output=True),
+        },
+        "blocks.0": {
+            "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
+        },
+        # Reference: https://github.com/huggingface/diffusers/pull/12660
+        # We need to disable the splitting of encoder_hidden_states because
+        # the image_encoder consistently generates 257 tokens for image_embed. This causes
+        # the shape of encoder_hidden_states—whose token count is always 769 (512 + 257)
+        # after concatenation—to be indivisible by the number of devices in the CP.
+        "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
+    }
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: Tuple[int] = (1, 2, 2),
+        num_attention_heads: int = 40,
+        attention_head_dim: int = 128,
+        in_channels: int = 16,
+        out_channels: int = 16,
+        text_dim: int = 4096,
+        freq_dim: int = 256,
+        ffn_dim: int = 13824,
+        num_layers: int = 40,
+        cross_attn_norm: bool = True,
+        qk_norm: Optional[str] = "rms_norm_across_heads",
+        eps: float = 1e-6,
+        image_dim: Optional[int] = None,
+        added_kv_proj_dim: Optional[int] = None,
+        rope_max_seq_len: int = 1024,
+        pos_embed_seq_len: Optional[int] = None,
+        rope_temporal_skip_len: int = 8,
+    ) -> None:
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+        out_channels = out_channels or in_channels
+
+        # 1. Patch & position embedding
+        self.rope = ChronoEditRotaryPosEmbed(
+            attention_head_dim, patch_size, rope_max_seq_len, temporal_skip_len=rope_temporal_skip_len
+        )
+        self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+        # 2. Condition embeddings
+        # image_embedding_dim=1280 for I2V model
+        self.condition_embedder = WanTimeTextImageEmbedding(
+            dim=inner_dim,
+            time_freq_dim=freq_dim,
+            time_proj_dim=inner_dim * 6,
+            text_embed_dim=text_dim,
+            image_embed_dim=image_dim,
+            pos_embed_seq_len=pos_embed_seq_len,
+        )
+
+        # 3. Transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                WanTransformerBlock(
+                    inner_dim, ffn_dim, num_attention_heads, qk_norm, cross_attn_norm, eps, added_kv_proj_dim
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        # 4. Output norm & projection
+        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
+        self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.config.patch_size
+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p_h
+        post_patch_width = width // p_w
+
+        rotary_emb = self.rope(hidden_states)
+
+        hidden_states = self.patch_embedding(hidden_states)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        # timestep shape: batch_size, or batch_size, seq_len (wan 2.2 ti2v)
+        if timestep.ndim == 2:
+            ts_seq_len = timestep.shape[1]
+            timestep = timestep.flatten()  # batch_size * seq_len
+        else:
+            ts_seq_len = None
+
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
+        )
+        if ts_seq_len is not None:
+            # batch_size, seq_len, 6, inner_dim
+            timestep_proj = timestep_proj.unflatten(2, (6, -1))
+        else:
+            # batch_size, 6, inner_dim
+            timestep_proj = timestep_proj.unflatten(1, (6, -1))
+
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        # 4. Transformer blocks
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for block in self.blocks:
+                hidden_states = self._gradient_checkpointing_func(
+                    block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
+                )
+        else:
+            for block in self.blocks:
+                hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+
+        # 5. Output norm, projection & unpatchify
+        if temb.ndim == 3:
+            # batch_size, seq_len, inner_dim (wan 2.2 ti2v)
+            shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2)
+            shift = shift.squeeze(2)
+            scale = scale.squeeze(2)
+        else:
+            # batch_size, inner_dim
+            shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
+
+        # Move the shift and scale tensors to the same device as hidden_states.
+        # When using multi-GPU inference via accelerate these will be on the
+        # first device rather than the last device, which hidden_states ends up
+        # on.
+        shift = shift.to(hidden_states.device)
+        scale = scale.to(hidden_states.device)
+
+        hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
+        hidden_states = self.proj_out(hidden_states)
+
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
+        )
+        hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+        output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/models/transformers/transformer_cogview3plus.py b/src/diffusers/models/transformers/transformer_cogview3plus.py
index 7356f4a606bb..e48290fb39d4 100644
--- a/src/diffusers/models/transformers/transformer_cogview3plus.py
+++ b/src/diffusers/models/transformers/transformer_cogview3plus.py
@@ -13,15 +13,15 @@
 # limitations under the License.
 
 
-from typing import Dict, Tuple, Union
+from typing import Tuple, Union
 
 import torch
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
-from ..attention import FeedForward
-from ..attention_processor import Attention, AttentionProcessor, CogVideoXAttnProcessor2_0
+from ..attention import AttentionMixin, FeedForward
+from ..attention_processor import Attention, CogVideoXAttnProcessor2_0
 from ..embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -125,7 +125,7 @@ def forward(
         return hidden_states, encoder_hidden_states
 
 
-class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin):
+class CogView3PlusTransformer2DModel(ModelMixin, AttentionMixin, ConfigMixin):
     r"""
     The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay
     Diffusion](https://huggingface.co/papers/2403.05121).
@@ -224,66 +224,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/diffusers/models/transformers/transformer_flux2.py b/src/diffusers/models/transformers/transformer_flux2.py
new file mode 100644
index 000000000000..c10bf3ed4f7b
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_flux2.py
@@ -0,0 +1,908 @@
+# Copyright 2025 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, is_torch_npu_available, logging, scale_lora_layers, unscale_lora_layers
+from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
+from ..attention import AttentionMixin, AttentionModuleMixin
+from ..attention_dispatch import dispatch_attention_fn
+from ..cache_utils import CacheMixin
+from ..embeddings import (
+    TimestepEmbedding,
+    Timesteps,
+    apply_rotary_emb,
+    get_1d_rotary_pos_embed,
+)
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormContinuous
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _get_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
+    query = attn.to_q(hidden_states)
+    key = attn.to_k(hidden_states)
+    value = attn.to_v(hidden_states)
+
+    encoder_query = encoder_key = encoder_value = None
+    if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
+        encoder_query = attn.add_q_proj(encoder_hidden_states)
+        encoder_key = attn.add_k_proj(encoder_hidden_states)
+        encoder_value = attn.add_v_proj(encoder_hidden_states)
+
+    return query, key, value, encoder_query, encoder_key, encoder_value
+
+
+def _get_fused_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
+    query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
+
+    encoder_query = encoder_key = encoder_value = (None,)
+    if encoder_hidden_states is not None and hasattr(attn, "to_added_qkv"):
+        encoder_query, encoder_key, encoder_value = attn.to_added_qkv(encoder_hidden_states).chunk(3, dim=-1)
+
+    return query, key, value, encoder_query, encoder_key, encoder_value
+
+
+def _get_qkv_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
+    if attn.fused_projections:
+        return _get_fused_projections(attn, hidden_states, encoder_hidden_states)
+    return _get_projections(attn, hidden_states, encoder_hidden_states)
+
+
+class Flux2SwiGLU(nn.Module):
+    """
+    Flux 2 uses a SwiGLU-style activation in the transformer feedforward sub-blocks, but with the linear projection
+    layer fused into the first linear layer of the FF sub-block. Thus, this module has no trainable parameters.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.gate_fn = nn.SiLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        x = self.gate_fn(x1) * x2
+        return x
+
+
+class Flux2FeedForward(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: float = 3.0,
+        inner_dim: Optional[int] = None,
+        bias: bool = False,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out or dim
+
+        # Flux2SwiGLU will reduce the dimension by half
+        self.linear_in = nn.Linear(dim, inner_dim * 2, bias=bias)
+        self.act_fn = Flux2SwiGLU()
+        self.linear_out = nn.Linear(inner_dim, dim_out, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.linear_in(x)
+        x = self.act_fn(x)
+        x = self.linear_out(x)
+        return x
+
+
+class Flux2AttnProcessor:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
+
+    def __call__(
+        self,
+        attn: "Flux2Attention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
+            attn, hidden_states, encoder_hidden_states
+        )
+
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        if attn.added_kv_proj_dim is not None:
+            encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
+            encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
+            encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
+
+            encoder_query = attn.norm_added_q(encoder_query)
+            encoder_key = attn.norm_added_k(encoder_key)
+
+            query = torch.cat([encoder_query, query], dim=1)
+            key = torch.cat([encoder_key, key], dim=1)
+            value = torch.cat([encoder_value, value], dim=1)
+
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
+                [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
+            )
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if encoder_hidden_states is not None:
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
+class Flux2Attention(torch.nn.Module, AttentionModuleMixin):
+    _default_processor_cls = Flux2AttnProcessor
+    _available_processors = [Flux2AttnProcessor]
+
+    def __init__(
+        self,
+        query_dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        added_kv_proj_dim: Optional[int] = None,
+        added_proj_bias: Optional[bool] = True,
+        out_bias: bool = True,
+        eps: float = 1e-5,
+        out_dim: int = None,
+        elementwise_affine: bool = True,
+        processor=None,
+    ):
+        super().__init__()
+
+        self.head_dim = dim_head
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.query_dim = query_dim
+        self.out_dim = out_dim if out_dim is not None else query_dim
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+
+        self.use_bias = bias
+        self.dropout = dropout
+
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.added_proj_bias = added_proj_bias
+
+        self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
+        self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
+        self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
+
+        # QK Norm
+        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+
+        self.to_out = torch.nn.ModuleList([])
+        self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
+        self.to_out.append(torch.nn.Dropout(dropout))
+
+        if added_kv_proj_dim is not None:
+            self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
+            self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
+            self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+            self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+            self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
+
+        if processor is None:
+            processor = self._default_processor_cls()
+        self.set_processor(processor)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
+        unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
+        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
+
+
+class Flux2ParallelSelfAttnProcessor:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
+
+    def __call__(
+        self,
+        attn: "Flux2ParallelSelfAttention",
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # Parallel in (QKV + MLP in) projection
+        hidden_states = attn.to_qkv_mlp_proj(hidden_states)
+        qkv, mlp_hidden_states = torch.split(
+            hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
+        )
+
+        # Handle the attention logic
+        query, key, value = qkv.chunk(3, dim=-1)
+
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Handle the feedforward (FF) logic
+        mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
+
+        # Concatenate and parallel output projection
+        hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
+        hidden_states = attn.to_out(hidden_states)
+
+        return hidden_states
+
+
+class Flux2ParallelSelfAttention(torch.nn.Module, AttentionModuleMixin):
+    """
+    Flux 2 parallel self-attention for the Flux 2 single-stream transformer blocks.
+
+    This implements a parallel transformer block, where the attention QKV projections are fused to the feedforward (FF)
+    input projections, and the attention output projections are fused to the FF output projections. See the [ViT-22B
+    paper](https://arxiv.org/abs/2302.05442) for a visual depiction of this type of transformer block.
+    """
+
+    _default_processor_cls = Flux2ParallelSelfAttnProcessor
+    _available_processors = [Flux2ParallelSelfAttnProcessor]
+    # Does not support QKV fusion as the QKV projections are always fused
+    _supports_qkv_fusion = False
+
+    def __init__(
+        self,
+        query_dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        out_bias: bool = True,
+        eps: float = 1e-5,
+        out_dim: int = None,
+        elementwise_affine: bool = True,
+        mlp_ratio: float = 4.0,
+        mlp_mult_factor: int = 2,
+        processor=None,
+    ):
+        super().__init__()
+
+        self.head_dim = dim_head
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.query_dim = query_dim
+        self.out_dim = out_dim if out_dim is not None else query_dim
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+
+        self.use_bias = bias
+        self.dropout = dropout
+
+        self.mlp_ratio = mlp_ratio
+        self.mlp_hidden_dim = int(query_dim * self.mlp_ratio)
+        self.mlp_mult_factor = mlp_mult_factor
+
+        # Fused QKV projections + MLP input projection
+        self.to_qkv_mlp_proj = torch.nn.Linear(
+            self.query_dim, self.inner_dim * 3 + self.mlp_hidden_dim * self.mlp_mult_factor, bias=bias
+        )
+        self.mlp_act_fn = Flux2SwiGLU()
+
+        # QK Norm
+        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+
+        # Fused attention output projection + MLP output projection
+        self.to_out = torch.nn.Linear(self.inner_dim + self.mlp_hidden_dim, self.out_dim, bias=out_bias)
+
+        if processor is None:
+            processor = self._default_processor_cls()
+        self.set_processor(processor)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
+        unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
+        return self.processor(self, hidden_states, attention_mask, image_rotary_emb, **kwargs)
+
+
+class Flux2SingleTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        mlp_ratio: float = 3.0,
+        eps: float = 1e-6,
+        bias: bool = False,
+    ):
+        super().__init__()
+
+        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+
+        # Note that the MLP in/out linear layers are fused with the attention QKV/out projections, respectively; this
+        # is often called a "parallel" transformer block. See the [ViT-22B paper](https://arxiv.org/abs/2302.05442)
+        # for a visual depiction of this type of transformer block.
+        self.attn = Flux2ParallelSelfAttention(
+            query_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            bias=bias,
+            out_bias=bias,
+            eps=eps,
+            mlp_ratio=mlp_ratio,
+            mlp_mult_factor=2,
+            processor=Flux2ParallelSelfAttnProcessor(),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor],
+        temb_mod_params: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        split_hidden_states: bool = False,
+        text_seq_len: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # If encoder_hidden_states is None, hidden_states is assumed to have encoder_hidden_states already
+        # concatenated
+        if encoder_hidden_states is not None:
+            text_seq_len = encoder_hidden_states.shape[1]
+            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        mod_shift, mod_scale, mod_gate = temb_mod_params
+
+        norm_hidden_states = self.norm(hidden_states)
+        norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift
+
+        joint_attention_kwargs = joint_attention_kwargs or {}
+        attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
+        )
+
+        hidden_states = hidden_states + mod_gate * attn_output
+        if hidden_states.dtype == torch.float16:
+            hidden_states = hidden_states.clip(-65504, 65504)
+
+        if split_hidden_states:
+            encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
+            return encoder_hidden_states, hidden_states
+        else:
+            return hidden_states
+
+
+class Flux2TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        mlp_ratio: float = 3.0,
+        eps: float = 1e-6,
+        bias: bool = False,
+    ):
+        super().__init__()
+        self.mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.norm1_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+
+        self.attn = Flux2Attention(
+            query_dim=dim,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            bias=bias,
+            added_proj_bias=bias,
+            out_bias=bias,
+            eps=eps,
+            processor=Flux2AttnProcessor(),
+        )
+
+        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.ff = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)
+
+        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.ff_context = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb_mod_params_img: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
+        temb_mod_params_txt: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        joint_attention_kwargs = joint_attention_kwargs or {}
+
+        # Modulation parameters shape: [1, 1, self.dim]
+        (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img
+        (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt
+
+        # Img stream
+        norm_hidden_states = self.norm1(hidden_states)
+        norm_hidden_states = (1 + scale_msa) * norm_hidden_states + shift_msa
+
+        # Conditioning txt stream
+        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
+        norm_encoder_hidden_states = (1 + c_scale_msa) * norm_encoder_hidden_states + c_shift_msa
+
+        # Attention on concatenated img + txt stream
+        attention_outputs = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
+        )
+
+        attn_output, context_attn_output = attention_outputs
+
+        # Process attention outputs for the image stream (`hidden_states`).
+        attn_output = gate_msa * attn_output
+        hidden_states = hidden_states + attn_output
+
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+        ff_output = self.ff(norm_hidden_states)
+        hidden_states = hidden_states + gate_mlp * ff_output
+
+        # Process attention outputs for the text stream (`encoder_hidden_states`).
+        context_attn_output = c_gate_msa * context_attn_output
+        encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
+
+        context_ff_output = self.ff_context(norm_encoder_hidden_states)
+        encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
+        if encoder_hidden_states.dtype == torch.float16:
+            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+
+        return encoder_hidden_states, hidden_states
+
+
+class Flux2PosEmbed(nn.Module):
+    # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
+    def __init__(self, theta: int, axes_dim: List[int]):
+        super().__init__()
+        self.theta = theta
+        self.axes_dim = axes_dim
+
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        # Expected ids shape: [S, len(self.axes_dim)]
+        cos_out = []
+        sin_out = []
+        pos = ids.float()
+        is_mps = ids.device.type == "mps"
+        is_npu = ids.device.type == "npu"
+        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+        # Unlike Flux 1, loop over len(self.axes_dim) rather than ids.shape[-1]
+        for i in range(len(self.axes_dim)):
+            cos, sin = get_1d_rotary_pos_embed(
+                self.axes_dim[i],
+                pos[..., i],
+                theta=self.theta,
+                repeat_interleave_real=True,
+                use_real=True,
+                freqs_dtype=freqs_dtype,
+            )
+            cos_out.append(cos)
+            sin_out.append(sin)
+        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
+        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
+        return freqs_cos, freqs_sin
+
+
+class Flux2TimestepGuidanceEmbeddings(nn.Module):
+    def __init__(self, in_channels: int = 256, embedding_dim: int = 6144, bias: bool = False):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(
+            in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
+        )
+
+        self.guidance_embedder = TimestepEmbedding(
+            in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
+        )
+
+    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor) -> torch.Tensor:
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(timestep.dtype))  # (N, D)
+
+        guidance_proj = self.time_proj(guidance)
+        guidance_emb = self.guidance_embedder(guidance_proj.to(guidance.dtype))  # (N, D)
+
+        time_guidance_emb = timesteps_emb + guidance_emb
+
+        return time_guidance_emb
+
+
+class Flux2Modulation(nn.Module):
+    def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = False):
+        super().__init__()
+        self.mod_param_sets = mod_param_sets
+
+        self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, temb: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
+        mod = self.act_fn(temb)
+        mod = self.linear(mod)
+
+        if mod.ndim == 2:
+            mod = mod.unsqueeze(1)
+        mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
+        # Return tuple of 3-tuples of modulation params shift/scale/gate
+        return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))
+
+
+class Flux2Transformer2DModel(
+    ModelMixin,
+    ConfigMixin,
+    PeftAdapterMixin,
+    FromOriginalModelMixin,
+    FluxTransformer2DLoadersMixin,
+    CacheMixin,
+    AttentionMixin,
+):
+    """
+    The Transformer model introduced in Flux 2.
+
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
+
+    Args:
+        patch_size (`int`, defaults to `1`):
+            Patch size to turn the input data into small patches.
+        in_channels (`int`, defaults to `128`):
+            The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `None`):
+            The number of channels in the output. If not specified, it defaults to `in_channels`.
+        num_layers (`int`, defaults to `8`):
+            The number of layers of dual stream DiT blocks to use.
+        num_single_layers (`int`, defaults to `48`):
+            The number of layers of single stream DiT blocks to use.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of dimensions to use for each attention head.
+        num_attention_heads (`int`, defaults to `48`):
+            The number of attention heads to use.
+        joint_attention_dim (`int`, defaults to `15360`):
+            The number of dimensions to use for the joint attention (embedding/channel dimension of
+            `encoder_hidden_states`).
+        pooled_projection_dim (`int`, defaults to `768`):
+            The number of dimensions to use for the pooled projection.
+        guidance_embeds (`bool`, defaults to `True`):
+            Whether to use guidance embeddings for guidance-distilled variant of the model.
+        axes_dims_rope (`Tuple[int]`, defaults to `(32, 32, 32, 32)`):
+            The dimensions to use for the rotary positional embeddings.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
+    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+    _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
+    _cp_plan = {
+        "": {
+            "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
+            "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
+            "img_ids": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
+            "txt_ids": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
+        },
+        "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
+    }
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: int = 1,
+        in_channels: int = 128,
+        out_channels: Optional[int] = None,
+        num_layers: int = 8,
+        num_single_layers: int = 48,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 48,
+        joint_attention_dim: int = 15360,
+        timestep_guidance_channels: int = 256,
+        mlp_ratio: float = 3.0,
+        axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
+        rope_theta: int = 2000,
+        eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.out_channels = out_channels or in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Sinusoidal positional embedding for RoPE on image and text tokens
+        self.pos_embed = Flux2PosEmbed(theta=rope_theta, axes_dim=axes_dims_rope)
+
+        # 2. Combined timestep + guidance embedding
+        self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
+            in_channels=timestep_guidance_channels, embedding_dim=self.inner_dim, bias=False
+        )
+
+        # 3. Modulation (double stream and single stream blocks share modulation parameters, resp.)
+        # Two sets of shift/scale/gate modulation parameters for the double stream attn and FF sub-blocks
+        self.double_stream_modulation_img = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
+        self.double_stream_modulation_txt = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
+        # Only one set of modulation parameters as the attn and FF sub-blocks are run in parallel for single stream
+        self.single_stream_modulation = Flux2Modulation(self.inner_dim, mod_param_sets=1, bias=False)
+
+        # 4. Input projections
+        self.x_embedder = nn.Linear(in_channels, self.inner_dim, bias=False)
+        self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim, bias=False)
+
+        # 5. Double Stream Transformer Blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                Flux2TransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    mlp_ratio=mlp_ratio,
+                    eps=eps,
+                    bias=False,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        # 6. Single Stream Transformer Blocks
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                Flux2SingleTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    mlp_ratio=mlp_ratio,
+                    eps=eps,
+                    bias=False,
+                )
+                for _ in range(num_single_layers)
+            ]
+        )
+
+        # 7. Output layers
+        self.norm_out = AdaLayerNormContinuous(
+            self.inner_dim, self.inner_dim, elementwise_affine=False, eps=eps, bias=False
+        )
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+        """
+        The [`FluxTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            timestep ( `torch.LongTensor`):
+                Used to indicate denoising step.
+            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
+                A list of tensors that if specified are added to the residuals of transformer blocks.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        # 0. Handle input arguments
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        num_txt_tokens = encoder_hidden_states.shape[1]
+
+        # 1. Calculate timestep embedding and modulation parameters
+        timestep = timestep.to(hidden_states.dtype) * 1000
+        guidance = guidance.to(hidden_states.dtype) * 1000
+
+        temb = self.time_guidance_embed(timestep, guidance)
+
+        double_stream_mod_img = self.double_stream_modulation_img(temb)
+        double_stream_mod_txt = self.double_stream_modulation_txt(temb)
+        single_stream_mod = self.single_stream_modulation(temb)[0]
+
+        # 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
+        hidden_states = self.x_embedder(hidden_states)
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+        # 3. Calculate RoPE embeddings from image and text tokens
+        # NOTE: the below logic means that we can't support batched inference with images of different resolutions or
+        # text prompts of differents lengths. Is this a use case we want to support?
+        if img_ids.ndim == 3:
+            img_ids = img_ids[0]
+        if txt_ids.ndim == 3:
+            txt_ids = txt_ids[0]
+
+        if is_torch_npu_available():
+            freqs_cos_image, freqs_sin_image = self.pos_embed(img_ids.cpu())
+            image_rotary_emb = (freqs_cos_image.npu(), freqs_sin_image.npu())
+            freqs_cos_text, freqs_sin_text = self.pos_embed(txt_ids.cpu())
+            text_rotary_emb = (freqs_cos_text.npu(), freqs_sin_text.npu())
+        else:
+            image_rotary_emb = self.pos_embed(img_ids)
+            text_rotary_emb = self.pos_embed(txt_ids)
+        concat_rotary_emb = (
+            torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
+            torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
+        )
+
+        # 4. Double Stream Transformer Blocks
+        for index_block, block in enumerate(self.transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    double_stream_mod_img,
+                    double_stream_mod_txt,
+                    concat_rotary_emb,
+                    joint_attention_kwargs,
+                )
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb_mod_params_img=double_stream_mod_img,
+                    temb_mod_params_txt=double_stream_mod_txt,
+                    image_rotary_emb=concat_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
+                )
+        # Concatenate text and image streams for single-block inference
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        # 5. Single Stream Transformer Blocks
+        for index_block, block in enumerate(self.single_transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    None,
+                    single_stream_mod,
+                    concat_rotary_emb,
+                    joint_attention_kwargs,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=None,
+                    temb_mod_params=single_stream_mod,
+                    image_rotary_emb=concat_rotary_emb,
+                    joint_attention_kwargs=joint_attention_kwargs,
+                )
+        # Remove text tokens from concatenated stream
+        hidden_states = hidden_states[:, num_txt_tokens:, ...]
+
+        # 6. Output layers
+        hidden_states = self.norm_out(hidden_states, temb)
+        output = self.proj_out(hidden_states)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index bc857ccab463..fb0ce1a30ff9 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -23,8 +23,9 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
-from ..attention import FeedForward
-from ..attention_processor import Attention, AttentionProcessor
+from ..attention import AttentionMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..attention_processor import Attention
 from ..cache_utils import CacheMixin
 from ..embeddings import (
     CombinedTimestepTextProjEmbeddings,
@@ -42,6 +43,9 @@
 
 
 class HunyuanVideoAttnProcessor2_0:
+    _attention_backend = None
+    _parallel_config = None
+
     def __init__(self):
         if not hasattr(F, "scaled_dot_product_attention"):
             raise ImportError(
@@ -64,9 +68,9 @@ def __call__(
         key = attn.to_k(hidden_states)
         value = attn.to_v(hidden_states)
 
-        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
 
         # 2. QK normalization
         if attn.norm_q is not None:
@@ -81,21 +85,29 @@ def __call__(
             if attn.add_q_proj is None and encoder_hidden_states is not None:
                 query = torch.cat(
                     [
-                        apply_rotary_emb(query[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
-                        query[:, :, -encoder_hidden_states.shape[1] :],
+                        apply_rotary_emb(
+                            query[:, : -encoder_hidden_states.shape[1]],
+                            image_rotary_emb,
+                            sequence_dim=1,
+                        ),
+                        query[:, -encoder_hidden_states.shape[1] :],
                     ],
-                    dim=2,
+                    dim=1,
                 )
                 key = torch.cat(
                     [
-                        apply_rotary_emb(key[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
-                        key[:, :, -encoder_hidden_states.shape[1] :],
+                        apply_rotary_emb(
+                            key[:, : -encoder_hidden_states.shape[1]],
+                            image_rotary_emb,
+                            sequence_dim=1,
+                        ),
+                        key[:, -encoder_hidden_states.shape[1] :],
                     ],
-                    dim=2,
+                    dim=1,
                 )
             else:
-                query = apply_rotary_emb(query, image_rotary_emb)
-                key = apply_rotary_emb(key, image_rotary_emb)
+                query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+                key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
 
         # 4. Encoder condition QKV projection and normalization
         if attn.add_q_proj is not None and encoder_hidden_states is not None:
@@ -103,24 +115,31 @@ def __call__(
             encoder_key = attn.add_k_proj(encoder_hidden_states)
             encoder_value = attn.add_v_proj(encoder_hidden_states)
 
-            encoder_query = encoder_query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            encoder_key = encoder_key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            encoder_value = encoder_value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+            encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
+            encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
+            encoder_value = encoder_value.unflatten(2, (attn.heads, -1))
 
             if attn.norm_added_q is not None:
                 encoder_query = attn.norm_added_q(encoder_query)
             if attn.norm_added_k is not None:
                 encoder_key = attn.norm_added_k(encoder_key)
 
-            query = torch.cat([query, encoder_query], dim=2)
-            key = torch.cat([key, encoder_key], dim=2)
-            value = torch.cat([value, encoder_value], dim=2)
+            query = torch.cat([query, encoder_query], dim=1)
+            key = torch.cat([key, encoder_key], dim=1)
+            value = torch.cat([value, encoder_value], dim=1)
 
         # 5. Attention
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
         )
-        hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
+        hidden_states = hidden_states.flatten(2, 3)
         hidden_states = hidden_states.to(query.dtype)
 
         # 6. Output projection
@@ -819,7 +838,9 @@ def forward(
         return hidden_states, encoder_hidden_states
 
 
-class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+class HunyuanVideoTransformer3DModel(
+    ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin
+):
     r"""
     A Transformer model for video-like data used in [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo).
 
@@ -895,7 +916,7 @@ def __init__(
         text_embed_dim: int = 4096,
         pooled_projection_dim: int = 768,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
         image_condition_type: Optional[str] = None,
     ) -> None:
         super().__init__()
@@ -968,66 +989,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video15.py b/src/diffusers/models/transformers/transformer_hunyuan_video15.py
new file mode 100644
index 000000000000..293ba996ea98
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video15.py
@@ -0,0 +1,793 @@
+# Copyright 2025 The Hunyuan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from diffusers.loaders import FromOriginalModelMixin
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ..attention import AttentionMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..attention_processor import Attention
+from ..cache_utils import CacheMixin
+from ..embeddings import (
+    CombinedTimestepTextProjEmbeddings,
+    TimestepEmbedding,
+    Timesteps,
+    get_1d_rotary_pos_embed,
+)
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class HunyuanVideo15AttnProcessor2_0:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "HunyuanVideo15AttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # 1. QKV projections
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
+
+        # 2. QK normalization
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        # 3. Rotational positional embeddings applied to latent stream
+        if image_rotary_emb is not None:
+            from ..embeddings import apply_rotary_emb
+
+            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
+
+        # 4. Encoder condition QKV projection and normalization
+        if encoder_hidden_states is not None:
+            encoder_query = attn.add_q_proj(encoder_hidden_states)
+            encoder_key = attn.add_k_proj(encoder_hidden_states)
+            encoder_value = attn.add_v_proj(encoder_hidden_states)
+
+            encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
+            encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
+            encoder_value = encoder_value.unflatten(2, (attn.heads, -1))
+
+            if attn.norm_added_q is not None:
+                encoder_query = attn.norm_added_q(encoder_query)
+            if attn.norm_added_k is not None:
+                encoder_key = attn.norm_added_k(encoder_key)
+
+            query = torch.cat([query, encoder_query], dim=1)
+            key = torch.cat([key, encoder_key], dim=1)
+            value = torch.cat([value, encoder_value], dim=1)
+
+        batch_size, seq_len, heads, dim = query.shape
+        attention_mask = F.pad(attention_mask, (seq_len - attention_mask.shape[1], 0), value=True)
+        attention_mask = attention_mask.bool()
+        self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
+        self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
+        attention_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
+
+        # 5. Attention
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # 6. Output projection
+        if encoder_hidden_states is not None:
+            hidden_states, encoder_hidden_states = (
+                hidden_states[:, : -encoder_hidden_states.shape[1]],
+                hidden_states[:, -encoder_hidden_states.shape[1] :],
+            )
+
+            if getattr(attn, "to_out", None) is not None:
+                hidden_states = attn.to_out[0](hidden_states)
+                hidden_states = attn.to_out[1](hidden_states)
+
+            if getattr(attn, "to_add_out", None) is not None:
+                encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        return hidden_states, encoder_hidden_states
+
+
+class HunyuanVideo15PatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: Union[int, Tuple[int, int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ) -> None:
+        super().__init__()
+
+        patch_size = (patch_size, patch_size, patch_size) if isinstance(patch_size, int) else patch_size
+        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.proj(hidden_states)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)  # BCFHW -> BNC
+        return hidden_states
+
+
+class HunyuanVideo15AdaNorm(nn.Module):
+    def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
+        super().__init__()
+
+        out_features = out_features or 2 * in_features
+        self.linear = nn.Linear(in_features, out_features)
+        self.nonlinearity = nn.SiLU()
+
+    def forward(
+        self, temb: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        temb = self.linear(self.nonlinearity(temb))
+        gate_msa, gate_mlp = temb.chunk(2, dim=1)
+        gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
+        return gate_msa, gate_mlp
+
+
+class HunyuanVideo15TimeEmbedding(nn.Module):
+    r"""
+    Time embedding for HunyuanVideo 1.5.
+
+    Supports standard timestep embedding and optional reference timestep embedding for MeanFlow-based super-resolution
+    models.
+
+    Args:
+        embedding_dim (`int`):
+            The dimension of the output embedding.
+    """
+
+    def __init__(self, embedding_dim: int, use_meanflow: bool = False):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.use_meanflow = use_meanflow
+        self.time_proj_r = None
+        self.timestep_embedder_r = None
+        if use_meanflow:
+            self.time_proj_r = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.timestep_embedder_r = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        timestep_r: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=timestep.dtype))
+
+        if timestep_r is not None:
+            timesteps_proj_r = self.time_proj_r(timestep_r)
+            timesteps_emb_r = self.timestep_embedder_r(timesteps_proj_r.to(dtype=timestep.dtype))
+            timesteps_emb = timesteps_emb + timesteps_emb_r
+
+        return timesteps_emb
+
+
+class HunyuanVideo15IndividualTokenRefinerBlock(nn.Module):
+    def __init__(
+        self,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        mlp_width_ratio: str = 4.0,
+        mlp_drop_rate: float = 0.0,
+        attention_bias: bool = True,
+    ) -> None:
+        super().__init__()
+
+        hidden_size = num_attention_heads * attention_head_dim
+
+        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
+        self.attn = Attention(
+            query_dim=hidden_size,
+            cross_attention_dim=None,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            bias=attention_bias,
+        )
+
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
+        self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
+
+        self.norm_out = HunyuanVideo15AdaNorm(hidden_size, 2 * hidden_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        norm_hidden_states = self.norm1(hidden_states)
+
+        attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=None,
+            attention_mask=attention_mask,
+        )
+
+        gate_msa, gate_mlp = self.norm_out(temb)
+        hidden_states = hidden_states + attn_output * gate_msa
+
+        ff_output = self.ff(self.norm2(hidden_states))
+        hidden_states = hidden_states + ff_output * gate_mlp
+
+        return hidden_states
+
+
+class HunyuanVideo15IndividualTokenRefiner(nn.Module):
+    def __init__(
+        self,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        num_layers: int,
+        mlp_width_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        attention_bias: bool = True,
+    ) -> None:
+        super().__init__()
+
+        self.refiner_blocks = nn.ModuleList(
+            [
+                HunyuanVideo15IndividualTokenRefinerBlock(
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_drop_rate=mlp_drop_rate,
+                    attention_bias=attention_bias,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> None:
+        self_attn_mask = None
+        if attention_mask is not None:
+            batch_size = attention_mask.shape[0]
+            seq_len = attention_mask.shape[1]
+            attention_mask = attention_mask.to(hidden_states.device).bool()
+            self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
+            self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
+            self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
+
+        for block in self.refiner_blocks:
+            hidden_states = block(hidden_states, temb, self_attn_mask)
+
+        return hidden_states
+
+
+class HunyuanVideo15TokenRefiner(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        num_layers: int,
+        mlp_ratio: float = 4.0,
+        mlp_drop_rate: float = 0.0,
+        attention_bias: bool = True,
+    ) -> None:
+        super().__init__()
+
+        hidden_size = num_attention_heads * attention_head_dim
+
+        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
+            embedding_dim=hidden_size, pooled_projection_dim=in_channels
+        )
+        self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
+        self.token_refiner = HunyuanVideo15IndividualTokenRefiner(
+            num_attention_heads=num_attention_heads,
+            attention_head_dim=attention_head_dim,
+            num_layers=num_layers,
+            mlp_width_ratio=mlp_ratio,
+            mlp_drop_rate=mlp_drop_rate,
+            attention_bias=attention_bias,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        if attention_mask is None:
+            pooled_projections = hidden_states.mean(dim=1)
+        else:
+            original_dtype = hidden_states.dtype
+            mask_float = attention_mask.float().unsqueeze(-1)
+            pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
+            pooled_projections = pooled_projections.to(original_dtype)
+
+        temb = self.time_text_embed(timestep, pooled_projections)
+        hidden_states = self.proj_in(hidden_states)
+        hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
+
+        return hidden_states
+
+
+class HunyuanVideo15RotaryPosEmbed(nn.Module):
+    def __init__(self, patch_size: int, patch_size_t: int, rope_dim: List[int], theta: float = 256.0) -> None:
+        super().__init__()
+
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t
+        self.rope_dim = rope_dim
+        self.theta = theta
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        rope_sizes = [num_frames // self.patch_size_t, height // self.patch_size, width // self.patch_size]
+
+        axes_grids = []
+        for i in range(len(rope_sizes)):
+            # Note: The following line diverges from original behaviour. We create the grid on the device, whereas
+            # original implementation creates it on CPU and then moves it to device. This results in numerical
+            # differences in layerwise debugging outputs, but visually it is the same.
+            grid = torch.arange(0, rope_sizes[i], device=hidden_states.device, dtype=torch.float32)
+            axes_grids.append(grid)
+        grid = torch.meshgrid(*axes_grids, indexing="ij")  # [W, H, T]
+        grid = torch.stack(grid, dim=0)  # [3, W, H, T]
+
+        freqs = []
+        for i in range(3):
+            freq = get_1d_rotary_pos_embed(self.rope_dim[i], grid[i].reshape(-1), self.theta, use_real=True)
+            freqs.append(freq)
+
+        freqs_cos = torch.cat([f[0] for f in freqs], dim=1)  # (W * H * T, D / 2)
+        freqs_sin = torch.cat([f[1] for f in freqs], dim=1)  # (W * H * T, D / 2)
+        return freqs_cos, freqs_sin
+
+
+class HunyuanVideo15ByT5TextProjection(nn.Module):
+    def __init__(self, in_features: int, hidden_size: int, out_features: int):
+        super().__init__()
+        self.norm = nn.LayerNorm(in_features)
+        self.linear_1 = nn.Linear(in_features, hidden_size)
+        self.linear_2 = nn.Linear(hidden_size, hidden_size)
+        self.linear_3 = nn.Linear(hidden_size, out_features)
+        self.act_fn = nn.GELU()
+
+    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.norm(encoder_hidden_states)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states = self.linear_3(hidden_states)
+        return hidden_states
+
+
+class HunyuanVideo15ImageProjection(nn.Module):
+    def __init__(self, in_channels: int, hidden_size: int):
+        super().__init__()
+        self.norm_in = nn.LayerNorm(in_channels)
+        self.linear_1 = nn.Linear(in_channels, in_channels)
+        self.act_fn = nn.GELU()
+        self.linear_2 = nn.Linear(in_channels, hidden_size)
+        self.norm_out = nn.LayerNorm(hidden_size)
+
+    def forward(self, image_embeds: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.norm_in(image_embeds)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.norm_out(hidden_states)
+        return hidden_states
+
+
+class HunyuanVideo15TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        mlp_ratio: float,
+        qk_norm: str = "rms_norm",
+    ) -> None:
+        super().__init__()
+
+        hidden_size = num_attention_heads * attention_head_dim
+
+        self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
+        self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
+
+        self.attn = Attention(
+            query_dim=hidden_size,
+            cross_attention_dim=None,
+            added_kv_proj_dim=hidden_size,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=hidden_size,
+            context_pre_only=False,
+            bias=True,
+            processor=HunyuanVideo15AttnProcessor2_0(),
+            qk_norm=qk_norm,
+            eps=1e-6,
+        )
+
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
+
+        self.norm2_context = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        *args,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # 1. Input normalization
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+            encoder_hidden_states, emb=temb
+        )
+
+        # 2. Joint attention
+        attn_output, context_attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            attention_mask=attention_mask,
+            image_rotary_emb=freqs_cis,
+        )
+
+        # 3. Modulation and residual connection
+        hidden_states = hidden_states + attn_output * gate_msa.unsqueeze(1)
+        encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa.unsqueeze(1)
+
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+
+        # 4. Feed-forward
+        ff_output = self.ff(norm_hidden_states)
+        context_ff_output = self.ff_context(norm_encoder_hidden_states)
+
+        hidden_states = hidden_states + gate_mlp.unsqueeze(1) * ff_output
+        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+
+        return hidden_states, encoder_hidden_states
+
+
+class HunyuanVideo15Transformer3DModel(
+    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
+):
+    r"""
+    A Transformer model for video-like data used in [HunyuanVideo1.5](https://huggingface.co/tencent/HunyuanVideo1.5).
+
+    Args:
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output.
+        num_attention_heads (`int`, defaults to `24`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        num_layers (`int`, defaults to `20`):
+            The number of layers of dual-stream blocks to use.
+        num_refiner_layers (`int`, defaults to `2`):
+            The number of layers of refiner blocks to use.
+        mlp_ratio (`float`, defaults to `4.0`):
+            The ratio of the hidden layer size to the input size in the feedforward network.
+        patch_size (`int`, defaults to `2`):
+            The size of the spatial patches to use in the patch embedding layer.
+        patch_size_t (`int`, defaults to `1`):
+            The size of the tmeporal patches to use in the patch embedding layer.
+        qk_norm (`str`, defaults to `rms_norm`):
+            The normalization to use for the query and key projections in the attention layers.
+        guidance_embeds (`bool`, defaults to `True`):
+            Whether to use guidance embeddings in the model.
+        text_embed_dim (`int`, defaults to `4096`):
+            Input dimension of text embeddings from the text encoder.
+        pooled_projection_dim (`int`, defaults to `768`):
+            The dimension of the pooled projection of the text embeddings.
+        rope_theta (`float`, defaults to `256.0`):
+            The value of theta to use in the RoPE layer.
+        rope_axes_dim (`Tuple[int]`, defaults to `(16, 56, 56)`):
+            The dimensions of the axes to use in the RoPE layer.
+    """
+
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["x_embedder", "context_embedder", "norm"]
+    _no_split_modules = [
+        "HunyuanVideo15TransformerBlock",
+        "HunyuanVideo15PatchEmbed",
+        "HunyuanVideo15TokenRefiner",
+    ]
+    _repeated_blocks = [
+        "HunyuanVideo15TransformerBlock",
+        "HunyuanVideo15PatchEmbed",
+        "HunyuanVideo15TokenRefiner",
+    ]
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 65,
+        out_channels: int = 32,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 128,
+        num_layers: int = 54,
+        num_refiner_layers: int = 2,
+        mlp_ratio: float = 4.0,
+        patch_size: int = 1,
+        patch_size_t: int = 1,
+        qk_norm: str = "rms_norm",
+        text_embed_dim: int = 3584,
+        text_embed_2_dim: int = 1472,
+        image_embed_dim: int = 1152,
+        rope_theta: float = 256.0,
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
+        # YiYi Notes: config based on target_size_config https://github.com/yiyixuxu/hy15/blob/main/hyvideo/pipelines/hunyuan_video_pipeline.py#L205
+        target_size: int = 640,  # did not name sample_size since it is in pixel spaces
+        task_type: str = "i2v",
+        use_meanflow: bool = False,
+    ) -> None:
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+        out_channels = out_channels or in_channels
+
+        # 1. Latent and condition embedders
+        self.x_embedder = HunyuanVideo15PatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
+        self.image_embedder = HunyuanVideo15ImageProjection(image_embed_dim, inner_dim)
+
+        self.context_embedder = HunyuanVideo15TokenRefiner(
+            text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
+        )
+        self.context_embedder_2 = HunyuanVideo15ByT5TextProjection(text_embed_2_dim, 2048, inner_dim)
+
+        self.time_embed = HunyuanVideo15TimeEmbedding(inner_dim, use_meanflow=use_meanflow)
+
+        self.cond_type_embed = nn.Embedding(3, inner_dim)
+
+        # 2. RoPE
+        self.rope = HunyuanVideo15RotaryPosEmbed(patch_size, patch_size_t, rope_axes_dim, rope_theta)
+
+        # 3. Dual stream transformer blocks
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                HunyuanVideo15TransformerBlock(
+                    num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        # 5. Output projection
+        self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_attention_mask: torch.Tensor,
+        timestep_r: Optional[torch.LongTensor] = None,
+        encoder_hidden_states_2: Optional[torch.Tensor] = None,
+        encoder_attention_mask_2: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.config.patch_size_t, self.config.patch_size, self.config.patch_size
+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p_h
+        post_patch_width = width // p_w
+
+        # 1. RoPE
+        image_rotary_emb = self.rope(hidden_states)
+
+        # 2. Conditional embeddings
+        temb = self.time_embed(timestep, timestep_r=timestep_r)
+
+        hidden_states = self.x_embedder(hidden_states)
+
+        # qwen text embedding
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states, timestep, encoder_attention_mask)
+
+        encoder_hidden_states_cond_emb = self.cond_type_embed(
+            torch.zeros_like(encoder_hidden_states[:, :, 0], dtype=torch.long)
+        )
+        encoder_hidden_states = encoder_hidden_states + encoder_hidden_states_cond_emb
+
+        # byt5 text embedding
+        encoder_hidden_states_2 = self.context_embedder_2(encoder_hidden_states_2)
+
+        encoder_hidden_states_2_cond_emb = self.cond_type_embed(
+            torch.ones_like(encoder_hidden_states_2[:, :, 0], dtype=torch.long)
+        )
+        encoder_hidden_states_2 = encoder_hidden_states_2 + encoder_hidden_states_2_cond_emb
+
+        # image embed
+        encoder_hidden_states_3 = self.image_embedder(image_embeds)
+        is_t2v = torch.all(image_embeds == 0)
+        if is_t2v:
+            encoder_hidden_states_3 = encoder_hidden_states_3 * 0.0
+            encoder_attention_mask_3 = torch.zeros(
+                (batch_size, encoder_hidden_states_3.shape[1]),
+                dtype=encoder_attention_mask.dtype,
+                device=encoder_attention_mask.device,
+            )
+        else:
+            encoder_attention_mask_3 = torch.ones(
+                (batch_size, encoder_hidden_states_3.shape[1]),
+                dtype=encoder_attention_mask.dtype,
+                device=encoder_attention_mask.device,
+            )
+        encoder_hidden_states_3_cond_emb = self.cond_type_embed(
+            2
+            * torch.ones_like(
+                encoder_hidden_states_3[:, :, 0],
+                dtype=torch.long,
+            )
+        )
+        encoder_hidden_states_3 = encoder_hidden_states_3 + encoder_hidden_states_3_cond_emb
+
+        # reorder and combine text tokens: combine valid tokens first, then padding
+        encoder_attention_mask = encoder_attention_mask.bool()
+        encoder_attention_mask_2 = encoder_attention_mask_2.bool()
+        encoder_attention_mask_3 = encoder_attention_mask_3.bool()
+        new_encoder_hidden_states = []
+        new_encoder_attention_mask = []
+
+        for text, text_mask, text_2, text_mask_2, image, image_mask in zip(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            encoder_hidden_states_2,
+            encoder_attention_mask_2,
+            encoder_hidden_states_3,
+            encoder_attention_mask_3,
+        ):
+            # Concatenate: [valid_image, valid_byt5, valid_mllm, invalid_image, invalid_byt5, invalid_mllm]
+            new_encoder_hidden_states.append(
+                torch.cat(
+                    [
+                        image[image_mask],  # valid image
+                        text_2[text_mask_2],  # valid byt5
+                        text[text_mask],  # valid mllm
+                        image[~image_mask],  # invalid image
+                        torch.zeros_like(text_2[~text_mask_2]),  # invalid byt5 (zeroed)
+                        torch.zeros_like(text[~text_mask]),  # invalid mllm (zeroed)
+                    ],
+                    dim=0,
+                )
+            )
+
+            # Apply same reordering to attention masks
+            new_encoder_attention_mask.append(
+                torch.cat(
+                    [
+                        image_mask[image_mask],
+                        text_mask_2[text_mask_2],
+                        text_mask[text_mask],
+                        image_mask[~image_mask],
+                        text_mask_2[~text_mask_2],
+                        text_mask[~text_mask],
+                    ],
+                    dim=0,
+                )
+            )
+
+        encoder_hidden_states = torch.stack(new_encoder_hidden_states)
+        encoder_attention_mask = torch.stack(new_encoder_attention_mask)
+
+        # 4. Transformer blocks
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for block in self.transformer_blocks:
+                hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    encoder_attention_mask,
+                    image_rotary_emb,
+                )
+
+        else:
+            for block in self.transformer_blocks:
+                hidden_states, encoder_hidden_states = block(
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    encoder_attention_mask,
+                    image_rotary_emb,
+                )
+
+        # 5. Output projection
+        hidden_states = self.norm_out(hidden_states, temb)
+        hidden_states = self.proj_out(hidden_states)
+
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p_h, p_w
+        )
+        hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
+        hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (hidden_states,)
+
+        return Transformer2DModelOutput(sample=hidden_states)
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
index 60b40fff3cb8..601ba0f0b472 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
@@ -139,7 +139,7 @@ def __init__(
         text_embed_dim: int = 4096,
         pooled_projection_dim: int = 768,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
         image_condition_type: Optional[str] = None,
         has_image_proj: int = False,
         image_proj_dim: int = 1152,
diff --git a/src/diffusers/models/transformers/transformer_hunyuanimage.py b/src/diffusers/models/transformers/transformer_hunyuanimage.py
index 7f37bf815b4b..d626e322ad6f 100644
--- a/src/diffusers/models/transformers/transformer_hunyuanimage.py
+++ b/src/diffusers/models/transformers/transformer_hunyuanimage.py
@@ -25,9 +25,9 @@
 from ...loaders import PeftAdapterMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import FeedForward
+from ..attention import AttentionMixin, FeedForward
 from ..attention_dispatch import dispatch_attention_fn
-from ..attention_processor import Attention, AttentionProcessor
+from ..attention_processor import Attention
 from ..cache_utils import CacheMixin
 from ..embeddings import (
     CombinedTimestepTextProjEmbeddings,
@@ -616,7 +616,9 @@ def forward(
         return hidden_states, encoder_hidden_states
 
 
-class HunyuanImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+class HunyuanImageTransformer2DModel(
+    ModelMixin, ConfigMixin, AttentionMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin
+):
     r"""
     The Transformer model used in [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1).
 
@@ -667,10 +669,7 @@ class HunyuanImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin,
         "HunyuanImagePatchEmbed",
         "HunyuanImageTokenRefiner",
     ]
-    _repeated_blocks = [
-        "HunyuanImageTransformerBlock",
-        "HunyuanImageSingleTransformerBlock",
-    ]
+    _repeated_blocks = ["HunyuanImageTransformerBlock", "HunyuanImageSingleTransformerBlock"]
 
     @register_to_config
     def __init__(
@@ -689,7 +688,7 @@ def __init__(
         text_embed_dim: int = 3584,
         text_embed_2_dim: Optional[int] = None,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (64, 64),
+        rope_axes_dim: Tuple[int, ...] = (64, 64),
         use_meanflow: bool = False,
     ) -> None:
         super().__init__()
@@ -743,66 +742,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/diffusers/models/transformers/transformer_ovis_image.py b/src/diffusers/models/transformers/transformer_ovis_image.py
new file mode 100644
index 000000000000..0a09aa720b3f
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_ovis_image.py
@@ -0,0 +1,581 @@
+# Copyright 2025 Alibaba Ovis-Image Team and The HuggingFace. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import is_torch_npu_available, logging
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention import AttentionModuleMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..cache_utils import CacheMixin
+from ..embeddings import TimestepEmbedding, Timesteps, apply_rotary_emb, get_1d_rotary_pos_embed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def _get_projections(attn: "OvisImageAttention", hidden_states, encoder_hidden_states=None):
+    query = attn.to_q(hidden_states)
+    key = attn.to_k(hidden_states)
+    value = attn.to_v(hidden_states)
+
+    encoder_query = encoder_key = encoder_value = None
+    if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
+        encoder_query = attn.add_q_proj(encoder_hidden_states)
+        encoder_key = attn.add_k_proj(encoder_hidden_states)
+        encoder_value = attn.add_v_proj(encoder_hidden_states)
+
+    return query, key, value, encoder_query, encoder_key, encoder_value
+
+
+def _get_fused_projections(attn: "OvisImageAttention", hidden_states, encoder_hidden_states=None):
+    query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
+
+    encoder_query = encoder_key = encoder_value = (None,)
+    if encoder_hidden_states is not None and hasattr(attn, "to_added_qkv"):
+        encoder_query, encoder_key, encoder_value = attn.to_added_qkv(encoder_hidden_states).chunk(3, dim=-1)
+
+    return query, key, value, encoder_query, encoder_key, encoder_value
+
+
+def _get_qkv_projections(attn: "OvisImageAttention", hidden_states, encoder_hidden_states=None):
+    if attn.fused_projections:
+        return _get_fused_projections(attn, hidden_states, encoder_hidden_states)
+    return _get_projections(attn, hidden_states, encoder_hidden_states)
+
+
+class OvisImageAttnProcessor:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
+
+    def __call__(
+        self,
+        attn: "OvisImageAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
+            attn, hidden_states, encoder_hidden_states
+        )
+
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        if attn.added_kv_proj_dim is not None:
+            encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
+            encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
+            encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
+
+            encoder_query = attn.norm_added_q(encoder_query)
+            encoder_key = attn.norm_added_k(encoder_key)
+
+            query = torch.cat([encoder_query, query], dim=1)
+            key = torch.cat([encoder_key, key], dim=1)
+            value = torch.cat([encoder_value, value], dim=1)
+
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
+            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(query.dtype)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
+                [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
+            )
+            hidden_states = attn.to_out[0](hidden_states)
+            hidden_states = attn.to_out[1](hidden_states)
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+            return hidden_states, encoder_hidden_states
+        else:
+            return hidden_states
+
+
+class OvisImageAttention(torch.nn.Module, AttentionModuleMixin):
+    _default_processor_cls = OvisImageAttnProcessor
+    _available_processors = [
+        OvisImageAttnProcessor,
+    ]
+
+    def __init__(
+        self,
+        query_dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        added_kv_proj_dim: Optional[int] = None,
+        added_proj_bias: Optional[bool] = True,
+        out_bias: bool = True,
+        eps: float = 1e-5,
+        out_dim: int = None,
+        context_pre_only: Optional[bool] = None,
+        pre_only: bool = False,
+        elementwise_affine: bool = True,
+        processor=None,
+    ):
+        super().__init__()
+
+        self.head_dim = dim_head
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.query_dim = query_dim
+        self.use_bias = bias
+        self.dropout = dropout
+        self.out_dim = out_dim if out_dim is not None else query_dim
+        self.context_pre_only = context_pre_only
+        self.pre_only = pre_only
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.added_proj_bias = added_proj_bias
+
+        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+        self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
+        self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
+        self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
+
+        if not self.pre_only:
+            self.to_out = torch.nn.ModuleList([])
+            self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
+            self.to_out.append(torch.nn.Dropout(dropout))
+
+        if added_kv_proj_dim is not None:
+            self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
+            self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
+            self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+            self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
+            self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
+
+        if processor is None:
+            processor = self._default_processor_cls()
+        self.set_processor(processor)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
+        quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"}
+        unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
+        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
+
+
+@maybe_allow_in_graph
+class OvisImageSingleTransformerBlock(nn.Module):
+    def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
+        super().__init__()
+        self.mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.norm = AdaLayerNormZeroSingle(dim)
+        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim * 2)
+        self.act_mlp = nn.SiLU()
+        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
+
+        self.attn = OvisImageAttention(
+            query_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            bias=True,
+            processor=OvisImageAttnProcessor(),
+            eps=1e-6,
+            pre_only=True,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        text_seq_len = encoder_hidden_states.shape[1]
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        residual = hidden_states
+        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
+        mlp_hidden_states, mlp_hidden_gate = torch.split(
+            self.proj_mlp(norm_hidden_states), [self.mlp_hidden_dim, self.mlp_hidden_dim], dim=-1
+        )
+        mlp_hidden_states = self.act_mlp(mlp_hidden_gate) * mlp_hidden_states
+        joint_attention_kwargs = joint_attention_kwargs or {}
+        attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
+        )
+
+        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
+        gate = gate.unsqueeze(1)
+        hidden_states = gate * self.proj_out(hidden_states)
+        hidden_states = residual + hidden_states
+        if hidden_states.dtype == torch.float16:
+            hidden_states = hidden_states.clip(-65504, 65504)
+
+        encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
+        return encoder_hidden_states, hidden_states
+
+
+@maybe_allow_in_graph
+class OvisImageTransformerBlock(nn.Module):
+    def __init__(
+        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
+    ):
+        super().__init__()
+
+        self.norm1 = AdaLayerNormZero(dim)
+        self.norm1_context = AdaLayerNormZero(dim)
+
+        self.attn = OvisImageAttention(
+            query_dim=dim,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            context_pre_only=False,
+            bias=True,
+            processor=OvisImageAttnProcessor(),
+            eps=eps,
+        )
+
+        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="swiglu")
+
+        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="swiglu")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+            encoder_hidden_states, emb=temb
+        )
+        joint_attention_kwargs = joint_attention_kwargs or {}
+
+        # Attention.
+        attention_outputs = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
+        )
+
+        if len(attention_outputs) == 2:
+            attn_output, context_attn_output = attention_outputs
+        elif len(attention_outputs) == 3:
+            attn_output, context_attn_output, ip_attn_output = attention_outputs
+
+        # Process attention outputs for the `hidden_states`.
+        attn_output = gate_msa.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + attn_output
+
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+        ff_output = self.ff(norm_hidden_states)
+        ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+        hidden_states = hidden_states + ff_output
+        if len(attention_outputs) == 3:
+            hidden_states = hidden_states + ip_attn_output
+
+        # Process attention outputs for the `encoder_hidden_states`.
+        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+        encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+
+        context_ff_output = self.ff_context(norm_encoder_hidden_states)
+        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+        if encoder_hidden_states.dtype == torch.float16:
+            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+
+        return encoder_hidden_states, hidden_states
+
+
+class OvisImagePosEmbed(nn.Module):
+    def __init__(self, theta: int, axes_dim: List[int]):
+        super().__init__()
+        self.theta = theta
+        self.axes_dim = axes_dim
+
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        n_axes = ids.shape[-1]
+        cos_out = []
+        sin_out = []
+        pos = ids.float()
+        is_mps = ids.device.type == "mps"
+        is_npu = ids.device.type == "npu"
+        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+        for i in range(n_axes):
+            cos, sin = get_1d_rotary_pos_embed(
+                self.axes_dim[i],
+                pos[:, i],
+                theta=self.theta,
+                repeat_interleave_real=True,
+                use_real=True,
+                freqs_dtype=freqs_dtype,
+            )
+            cos_out.append(cos)
+            sin_out.append(sin)
+        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
+        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
+        return freqs_cos, freqs_sin
+
+
+class OvisImageTransformer2DModel(
+    ModelMixin,
+    ConfigMixin,
+    PeftAdapterMixin,
+    FromOriginalModelMixin,
+    CacheMixin,
+):
+    """
+    The Transformer model introduced in Ovis-Image.
+
+    Reference: https://github.com/AIDC-AI/Ovis-Image
+
+    Args:
+        patch_size (`int`, defaults to `1`):
+            Patch size to turn the input data into small patches.
+        in_channels (`int`, defaults to `64`):
+            The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `None`):
+            The number of channels in the output. If not specified, it defaults to `in_channels`.
+        num_layers (`int`, defaults to `6`):
+            The number of layers of dual stream DiT blocks to use.
+        num_single_layers (`int`, defaults to `27`):
+            The number of layers of single stream DiT blocks to use.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of dimensions to use for each attention head.
+        num_attention_heads (`int`, defaults to `24`):
+            The number of attention heads to use.
+        joint_attention_dim (`int`, defaults to `2048`):
+            The number of dimensions to use for the joint attention (embedding/channel dimension of
+            `encoder_hidden_states`).
+        axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
+            The dimensions to use for the rotary positional embeddings.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["OvisImageTransformerBlock", "OvisImageSingleTransformerBlock"]
+    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+    _repeated_blocks = ["OvisImageTransformerBlock", "OvisImageSingleTransformerBlock"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: int = 1,
+        in_channels: int = 64,
+        out_channels: Optional[int] = 64,
+        num_layers: int = 6,
+        num_single_layers: int = 27,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        joint_attention_dim: int = 2048,
+        axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
+    ):
+        super().__init__()
+        self.out_channels = out_channels or in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.pos_embed = OvisImagePosEmbed(theta=10000, axes_dim=axes_dims_rope)
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim)
+
+        self.context_embedder_norm = nn.RMSNorm(joint_attention_dim, eps=1e-6)
+        self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
+        self.x_embedder = nn.Linear(in_channels, self.inner_dim)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                OvisImageTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                OvisImageSingleTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                )
+                for _ in range(num_single_layers)
+            ]
+        )
+
+        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        return_dict: bool = True,
+    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+        """
+        The [`OvisImageTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            timestep (`torch.LongTensor`):
+                Used to indicate denoising step.
+            img_ids: (`torch.Tensor`):
+                The position ids for image tokens.
+            txt_ids (`torch.Tensor`):
+                The position ids for text tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        hidden_states = self.x_embedder(hidden_states)
+
+        timestep = timestep.to(hidden_states.dtype) * 1000
+
+        timesteps_proj = self.time_proj(timestep)
+        temb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))
+
+        encoder_hidden_states = self.context_embedder_norm(encoder_hidden_states)
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+        if txt_ids.ndim == 3:
+            logger.warning(
+                "Passing `txt_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            txt_ids = txt_ids[0]
+        if img_ids.ndim == 3:
+            logger.warning(
+                "Passing `img_ids` 3d torch.Tensor is deprecated."
+                "Please remove the batch dimension and pass it as a 2d torch Tensor"
+            )
+            img_ids = img_ids[0]
+
+        ids = torch.cat((txt_ids, img_ids), dim=0)
+        if is_torch_npu_available():
+            freqs_cos, freqs_sin = self.pos_embed(ids.cpu())
+            image_rotary_emb = (freqs_cos.npu(), freqs_sin.npu())
+        else:
+            image_rotary_emb = self.pos_embed(ids)
+
+        for index_block, block in enumerate(self.transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+
+        for index_block, block in enumerate(self.single_transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    image_rotary_emb,
+                )
+
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+
+        hidden_states = self.norm_out(hidden_states, temb)
+        output = self.proj_out(hidden_states)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/models/transformers/transformer_prx.py b/src/diffusers/models/transformers/transformer_prx.py
index 9b2664b9cb26..a87c120fdcd7 100644
--- a/src/diffusers/models/transformers/transformer_prx.py
+++ b/src/diffusers/models/transformers/transformer_prx.py
@@ -16,7 +16,6 @@
 
 import torch
 from torch import nn
-from torch.nn.functional import fold, unfold
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
@@ -275,7 +274,12 @@ def __init__(self, dim: int, theta: int, axes_dim: List[int]):
 
     def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
         assert dim % 2 == 0
-        scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
+
+        is_mps = pos.device.type == "mps"
+        is_npu = pos.device.type == "npu"
+        dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+
+        scale = torch.arange(0, dim, 2, dtype=dtype, device=pos.device) / dim
         omega = 1.0 / (theta**scale)
         out = pos.unsqueeze(-1) * omega.unsqueeze(0)
         out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
@@ -527,7 +531,19 @@ def img2seq(img: torch.Tensor, patch_size: int) -> torch.Tensor:
             Flattened patch sequence of shape `(B, L, C * patch_size * patch_size)`, where `L = (H // patch_size) * (W
             // patch_size)` is the number of patches.
     """
-    return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2)
+    b, c, h, w = img.shape
+    p = patch_size
+
+    # Reshape to (B, C, H//p, p, W//p, p) separating grid and patch dimensions
+    img = img.reshape(b, c, h // p, p, w // p, p)
+
+    # Permute to (B, H//p, W//p, C, p, p) using einsum
+    # n=batch, c=channels, h=grid_height, p=patch_height, w=grid_width, q=patch_width
+    img = torch.einsum("nchpwq->nhwcpq", img)
+
+    # Flatten to (B, L, C * p * p)
+    img = img.reshape(b, -1, c * p * p)
+    return img
 
 
 def seq2img(seq: torch.Tensor, patch_size: int, shape: torch.Tensor) -> torch.Tensor:
@@ -549,12 +565,26 @@ def seq2img(seq: torch.Tensor, patch_size: int, shape: torch.Tensor) -> torch.Te
             Reconstructed image tensor of shape `(B, C, H, W)`.
     """
     if isinstance(shape, tuple):
-        shape = shape[-2:]
+        h, w = shape[-2:]
     elif isinstance(shape, torch.Tensor):
-        shape = (int(shape[0]), int(shape[1]))
+        h, w = (int(shape[0]), int(shape[1]))
     else:
         raise NotImplementedError(f"shape type {type(shape)} not supported")
-    return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size)
+
+    b, l, d = seq.shape
+    p = patch_size
+    c = d // (p * p)
+
+    # Reshape back to grid structure: (B, H//p, W//p, C, p, p)
+    seq = seq.reshape(b, h // p, w // p, c, p, p)
+
+    # Permute back to image layout: (B, C, H//p, p, W//p, p)
+    # n=batch, h=grid_height, w=grid_width, c=channels, p=patch_height, q=patch_width
+    seq = torch.einsum("nhwcpq->nchpwq", seq)
+
+    # Final reshape to (B, C, H, W)
+    seq = seq.reshape(b, c, h, w)
+    return seq
 
 
 class PRXTransformer2DModel(ModelMixin, ConfigMixin, AttentionMixin):
@@ -689,6 +719,7 @@ def _compute_timestep_embedding(self, timestep: torch.Tensor, dtype: torch.dtype
                 max_period=self.time_max_period,
                 scale=self.time_factor,
                 flip_sin_to_cos=True,  # Match original cos, sin order
+                downscale_freq_shift=0.0,
             ).to(dtype)
         )
 
diff --git a/src/diffusers/models/transformers/transformer_sana_video.py b/src/diffusers/models/transformers/transformer_sana_video.py
new file mode 100644
index 000000000000..a4f90342631a
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_sana_video.py
@@ -0,0 +1,705 @@
+# Copyright 2025 The HuggingFace Team and SANA-Video Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ..attention import AttentionMixin
+from ..attention_dispatch import dispatch_attention_fn
+from ..attention_processor import Attention
+from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import AdaLayerNormSingle, RMSNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class GLUMBTempConv(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        expand_ratio: float = 4,
+        norm_type: Optional[str] = None,
+        residual_connection: bool = True,
+    ) -> None:
+        super().__init__()
+
+        hidden_channels = int(expand_ratio * in_channels)
+        self.norm_type = norm_type
+        self.residual_connection = residual_connection
+
+        self.nonlinearity = nn.SiLU()
+        self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
+        self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
+        self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
+
+        self.norm = None
+        if norm_type == "rms_norm":
+            self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
+
+        self.conv_temp = nn.Conv2d(
+            out_channels, out_channels, kernel_size=(3, 1), stride=1, padding=(1, 0), bias=False
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if self.residual_connection:
+            residual = hidden_states
+        batch_size, num_frames, height, width, num_channels = hidden_states.shape
+        hidden_states = hidden_states.view(batch_size * num_frames, height, width, num_channels).permute(0, 3, 1, 2)
+
+        hidden_states = self.conv_inverted(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.conv_depth(hidden_states)
+        hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
+        hidden_states = hidden_states * self.nonlinearity(gate)
+
+        hidden_states = self.conv_point(hidden_states)
+
+        # Temporal aggregation
+        hidden_states_temporal = hidden_states.view(batch_size, num_frames, num_channels, height * width).permute(
+            0, 2, 1, 3
+        )
+        hidden_states = hidden_states_temporal + self.conv_temp(hidden_states_temporal)
+        hidden_states = hidden_states.permute(0, 2, 3, 1).view(batch_size, num_frames, height, width, num_channels)
+
+        if self.norm_type == "rms_norm":
+            # move channel to the last dimension so we apply RMSnorm across channel dimension
+            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
+
+        if self.residual_connection:
+            hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class SanaLinearAttnProcessor3_0:
+    r"""
+    Processor for implementing scaled dot-product linear attention.
+    """
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        original_dtype = hidden_states.dtype
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
+        # B,N,H,C
+
+        query = F.relu(query)
+        key = F.relu(key)
+
+        if rotary_emb is not None:
+
+            def apply_rotary_emb(
+                hidden_states: torch.Tensor,
+                freqs_cos: torch.Tensor,
+                freqs_sin: torch.Tensor,
+            ):
+                x1, x2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1)
+                cos = freqs_cos[..., 0::2]
+                sin = freqs_sin[..., 1::2]
+                out = torch.empty_like(hidden_states)
+                out[..., 0::2] = x1 * cos - x2 * sin
+                out[..., 1::2] = x1 * sin + x2 * cos
+                return out.type_as(hidden_states)
+
+            query_rotate = apply_rotary_emb(query, *rotary_emb)
+            key_rotate = apply_rotary_emb(key, *rotary_emb)
+
+        # B,H,C,N
+        query = query.permute(0, 2, 3, 1)
+        key = key.permute(0, 2, 3, 1)
+        query_rotate = query_rotate.permute(0, 2, 3, 1)
+        key_rotate = key_rotate.permute(0, 2, 3, 1)
+        value = value.permute(0, 2, 3, 1)
+
+        query_rotate, key_rotate, value = query_rotate.float(), key_rotate.float(), value.float()
+
+        z = 1 / (key.sum(dim=-1, keepdim=True).transpose(-2, -1) @ query + 1e-15)
+
+        scores = torch.matmul(value, key_rotate.transpose(-1, -2))
+        hidden_states = torch.matmul(scores, query_rotate)
+
+        hidden_states = hidden_states * z
+        # B,H,C,N
+        hidden_states = hidden_states.flatten(1, 2).transpose(1, 2)
+        hidden_states = hidden_states.to(original_dtype)
+
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class WanRotaryPosEmbed(nn.Module):
+    def __init__(
+        self,
+        attention_head_dim: int,
+        patch_size: Tuple[int, int, int],
+        max_seq_len: int,
+        theta: float = 10000.0,
+    ):
+        super().__init__()
+
+        self.attention_head_dim = attention_head_dim
+        self.patch_size = patch_size
+        self.max_seq_len = max_seq_len
+
+        h_dim = w_dim = 2 * (attention_head_dim // 6)
+        t_dim = attention_head_dim - h_dim - w_dim
+
+        self.t_dim = t_dim
+        self.h_dim = h_dim
+        self.w_dim = w_dim
+
+        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+
+        freqs_cos = []
+        freqs_sin = []
+
+        for dim in [t_dim, h_dim, w_dim]:
+            freq_cos, freq_sin = get_1d_rotary_pos_embed(
+                dim,
+                max_seq_len,
+                theta,
+                use_real=True,
+                repeat_interleave_real=True,
+                freqs_dtype=freqs_dtype,
+            )
+            freqs_cos.append(freq_cos)
+            freqs_sin.append(freq_sin)
+
+        self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
+        self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.patch_size
+        ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+
+        split_sizes = [self.t_dim, self.h_dim, self.w_dim]
+
+        freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
+        freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
+
+        freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+        freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+
+        return freqs_cos, freqs_sin
+
+
+class SanaModulatedNorm(nn.Module):
+    def __init__(self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim, elementwise_affine=elementwise_affine, eps=eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, temb: torch.Tensor, scale_shift_table: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states = self.norm(hidden_states)
+        shift, scale = (scale_shift_table[None, None] + temb[:, :, None].to(scale_shift_table.device)).unbind(dim=2)
+        hidden_states = hidden_states * (1 + scale) + shift
+        return hidden_states
+
+
+class SanaCombinedTimestepGuidanceEmbeddings(nn.Module):
+    def __init__(self, embedding_dim):
+        super().__init__()
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.guidance_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+
+    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor = None, hidden_dtype: torch.dtype = None):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+
+        guidance_proj = self.guidance_condition_proj(guidance)
+        guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=hidden_dtype))
+        conditioning = timesteps_emb + guidance_emb
+
+        return self.linear(self.silu(conditioning)), conditioning
+
+
+class SanaAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim)
+        key = key.view(batch_size, -1, attn.heads, head_dim)
+        value = value.view(batch_size, -1, attn.heads, head_dim)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class SanaVideoTransformerBlock(nn.Module):
+    r"""
+    Transformer block introduced in [Sana-Video](https://huggingface.co/papers/2509.24695).
+    """
+
+    def __init__(
+        self,
+        dim: int = 2240,
+        num_attention_heads: int = 20,
+        attention_head_dim: int = 112,
+        dropout: float = 0.0,
+        num_cross_attention_heads: Optional[int] = 20,
+        cross_attention_head_dim: Optional[int] = 112,
+        cross_attention_dim: Optional[int] = 2240,
+        attention_bias: bool = True,
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        attention_out_bias: bool = True,
+        mlp_ratio: float = 3.0,
+        qk_norm: Optional[str] = "rms_norm_across_heads",
+        rope_max_seq_len: int = 1024,
+    ) -> None:
+        super().__init__()
+
+        # 1. Self Attention
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=norm_eps)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            kv_heads=num_attention_heads if qk_norm is not None else None,
+            qk_norm=qk_norm,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=None,
+            processor=SanaLinearAttnProcessor3_0(),
+        )
+
+        # 2. Cross Attention
+        if cross_attention_dim is not None:
+            self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+            self.attn2 = Attention(
+                query_dim=dim,
+                qk_norm=qk_norm,
+                kv_heads=num_cross_attention_heads if qk_norm is not None else None,
+                cross_attention_dim=cross_attention_dim,
+                heads=num_cross_attention_heads,
+                dim_head=cross_attention_head_dim,
+                dropout=dropout,
+                bias=True,
+                out_bias=attention_out_bias,
+                processor=SanaAttnProcessor2_0(),
+            )
+
+        # 3. Feed-forward
+        self.ff = GLUMBTempConv(dim, dim, mlp_ratio, norm_type=None, residual_connection=False)
+
+        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        frames: int = None,
+        height: int = None,
+        width: int = None,
+        rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+
+        # 1. Modulation
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+            self.scale_shift_table[None, None] + timestep.reshape(batch_size, timestep.shape[1], 6, -1)
+        ).unbind(dim=2)
+
+        # 2. Self Attention
+        norm_hidden_states = self.norm1(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+        norm_hidden_states = norm_hidden_states.to(hidden_states.dtype)
+
+        attn_output = self.attn1(norm_hidden_states, rotary_emb=rotary_emb)
+        hidden_states = hidden_states + gate_msa * attn_output
+
+        # 3. Cross Attention
+        if self.attn2 is not None:
+            attn_output = self.attn2(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+            )
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
+        norm_hidden_states = norm_hidden_states.unflatten(1, (frames, height, width))
+        ff_output = self.ff(norm_hidden_states)
+        ff_output = ff_output.flatten(1, 3)
+        hidden_states = hidden_states + gate_mlp * ff_output
+
+        return hidden_states
+
+
+class SanaVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, AttentionMixin):
+    r"""
+    A 3D Transformer model introduced in [Sana-Video](https://huggingface.co/papers/2509.24695) family of models.
+
+    Args:
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `16`):
+            The number of channels in the output.
+        num_attention_heads (`int`, defaults to `20`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `112`):
+            The number of channels in each head.
+        num_layers (`int`, defaults to `20`):
+            The number of layers of Transformer blocks to use.
+        num_cross_attention_heads (`int`, *optional*, defaults to `20`):
+            The number of heads to use for cross-attention.
+        cross_attention_head_dim (`int`, *optional*, defaults to `112`):
+            The number of channels in each head for cross-attention.
+        cross_attention_dim (`int`, *optional*, defaults to `2240`):
+            The number of channels in the cross-attention output.
+        caption_channels (`int`, defaults to `2304`):
+            The number of channels in the caption embeddings.
+        mlp_ratio (`float`, defaults to `2.5`):
+            The expansion ratio to use in the GLUMBConv layer.
+        dropout (`float`, defaults to `0.0`):
+            The dropout probability.
+        attention_bias (`bool`, defaults to `False`):
+            Whether to use bias in the attention layer.
+        sample_size (`int`, defaults to `32`):
+            The base size of the input latent.
+        patch_size (`int`, defaults to `1`):
+            The size of the patches to use in the patch embedding layer.
+        norm_elementwise_affine (`bool`, defaults to `False`):
+            Whether to use elementwise affinity in the normalization layer.
+        norm_eps (`float`, defaults to `1e-6`):
+            The epsilon value for the normalization layer.
+        qk_norm (`str`, *optional*, defaults to `None`):
+            The normalization to use for the query and key.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["SanaVideoTransformerBlock", "SanaModulatedNorm"]
+    _skip_layerwise_casting_patterns = ["patch_embedding", "norm"]
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 16,
+        out_channels: Optional[int] = 16,
+        num_attention_heads: int = 20,
+        attention_head_dim: int = 112,
+        num_layers: int = 20,
+        num_cross_attention_heads: Optional[int] = 20,
+        cross_attention_head_dim: Optional[int] = 112,
+        cross_attention_dim: Optional[int] = 2240,
+        caption_channels: int = 2304,
+        mlp_ratio: float = 2.5,
+        dropout: float = 0.0,
+        attention_bias: bool = False,
+        sample_size: int = 30,
+        patch_size: Tuple[int, int, int] = (1, 2, 2),
+        norm_elementwise_affine: bool = False,
+        norm_eps: float = 1e-6,
+        interpolation_scale: Optional[int] = None,
+        guidance_embeds: bool = False,
+        guidance_embeds_scale: float = 0.1,
+        qk_norm: Optional[str] = "rms_norm_across_heads",
+        rope_max_seq_len: int = 1024,
+    ) -> None:
+        super().__init__()
+
+        out_channels = out_channels or in_channels
+        inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Patch & position embedding
+        self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+        self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+        # 2. Additional condition embeddings
+        if guidance_embeds:
+            self.time_embed = SanaCombinedTimestepGuidanceEmbeddings(inner_dim)
+        else:
+            self.time_embed = AdaLayerNormSingle(inner_dim)
+
+        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
+        self.caption_norm = RMSNorm(inner_dim, eps=1e-5, elementwise_affine=True)
+
+        # 3. Transformer blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                SanaVideoTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    num_cross_attention_heads=num_cross_attention_heads,
+                    cross_attention_head_dim=cross_attention_head_dim,
+                    cross_attention_dim=cross_attention_dim,
+                    attention_bias=attention_bias,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                    mlp_ratio=mlp_ratio,
+                    qk_norm=qk_norm,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        # 4. Output blocks
+        self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
+        self.norm_out = SanaModulatedNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(inner_dim, math.prod(patch_size) * out_channels)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        guidance: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_block_samples: Optional[Tuple[torch.Tensor]] = None,
+        return_dict: bool = True,
+    ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.config.patch_size
+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p_h
+        post_patch_width = width // p_w
+
+        rotary_emb = self.rope(hidden_states)
+
+        hidden_states = self.patch_embedding(hidden_states)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        if guidance is not None:
+            timestep, embedded_timestep = self.time_embed(
+                timestep.flatten(), guidance=guidance, hidden_dtype=hidden_states.dtype
+            )
+        else:
+            timestep, embedded_timestep = self.time_embed(
+                timestep.flatten(), batch_size=batch_size, hidden_dtype=hidden_states.dtype
+            )
+
+        timestep = timestep.view(batch_size, -1, timestep.size(-1))
+        embedded_timestep = embedded_timestep.view(batch_size, -1, embedded_timestep.size(-1))
+
+        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+        encoder_hidden_states = self.caption_norm(encoder_hidden_states)
+
+        # 2. Transformer blocks
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for index_block, block in enumerate(self.transformer_blocks):
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    post_patch_num_frames,
+                    post_patch_height,
+                    post_patch_width,
+                    rotary_emb,
+                )
+                if controlnet_block_samples is not None and 0 < index_block <= len(controlnet_block_samples):
+                    hidden_states = hidden_states + controlnet_block_samples[index_block - 1]
+
+        else:
+            for index_block, block in enumerate(self.transformer_blocks):
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    post_patch_num_frames,
+                    post_patch_height,
+                    post_patch_width,
+                    rotary_emb,
+                )
+                if controlnet_block_samples is not None and 0 < index_block <= len(controlnet_block_samples):
+                    hidden_states = hidden_states + controlnet_block_samples[index_block - 1]
+
+        # 3. Normalization
+        hidden_states = self.norm_out(hidden_states, embedded_timestep, self.scale_shift_table)
+
+        hidden_states = self.proj_out(hidden_states)
+
+        # 5. Unpatchify
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
+        )
+        hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+        output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py
index 762d89c303d7..05391e047b7a 100644
--- a/src/diffusers/models/transformers/transformer_sd3.py
+++ b/src/diffusers/models/transformers/transformer_sd3.py
@@ -20,10 +20,9 @@
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, SD3Transformer2DLoadersMixin
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import maybe_allow_in_graph
-from ..attention import FeedForward, JointTransformerBlock
+from ..attention import AttentionMixin, FeedForward, JointTransformerBlock
 from ..attention_processor import (
     Attention,
-    AttentionProcessor,
     FusedJointAttnProcessor2_0,
     JointAttnProcessor2_0,
 )
@@ -78,7 +77,7 @@ def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor):
 
 
 class SD3Transformer2DModel(
-    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, SD3Transformer2DLoadersMixin
+    ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, SD3Transformer2DLoadersMixin
 ):
     """
     The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).
@@ -214,66 +213,6 @@ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int
         for module in self.children():
             fn_recursive_feed_forward(module, None, 0)
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedJointAttnProcessor2_0
     def fuse_qkv_projections(self):
         """
diff --git a/src/diffusers/models/transformers/transformer_skyreels_v2.py b/src/diffusers/models/transformers/transformer_skyreels_v2.py
index 6b600aa22487..2b9fc5b8d9fb 100644
--- a/src/diffusers/models/transformers/transformer_skyreels_v2.py
+++ b/src/diffusers/models/transformers/transformer_skyreels_v2.py
@@ -389,6 +389,10 @@ def __init__(
         t_dim = attention_head_dim - h_dim - w_dim
         freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
 
+        self.t_dim = t_dim
+        self.h_dim = h_dim
+        self.w_dim = w_dim
+
         freqs_cos = []
         freqs_sin = []
 
@@ -412,11 +416,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         p_t, p_h, p_w = self.patch_size
         ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
 
-        split_sizes = [
-            self.attention_head_dim - 2 * (self.attention_head_dim // 3),
-            self.attention_head_dim // 3,
-            self.attention_head_dim // 3,
-        ]
+        split_sizes = [self.t_dim, self.h_dim, self.w_dim]
 
         freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
         freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
@@ -570,7 +570,7 @@ class SkyReelsV2Transformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 16,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
index dd75fb124f1a..f7693ec5d3ac 100644
--- a/src/diffusers/models/transformers/transformer_wan.py
+++ b/src/diffusers/models/transformers/transformer_wan.py
@@ -362,6 +362,11 @@ def __init__(
 
         h_dim = w_dim = 2 * (attention_head_dim // 6)
         t_dim = attention_head_dim - h_dim - w_dim
+
+        self.t_dim = t_dim
+        self.h_dim = h_dim
+        self.w_dim = w_dim
+
         freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
 
         freqs_cos = []
@@ -387,11 +392,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         p_t, p_h, p_w = self.patch_size
         ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
 
-        split_sizes = [
-            self.attention_head_dim - 2 * (self.attention_head_dim // 3),
-            self.attention_head_dim // 3,
-            self.attention_head_dim // 3,
-        ]
+        split_sizes = [self.t_dim, self.h_dim, self.w_dim]
 
         freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
         freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
@@ -555,12 +556,15 @@ class WanTransformer3DModel(
             "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
         },
         "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
+        "": {
+            "timestep": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False),
+        },
     }
 
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 40,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_wan_animate.py b/src/diffusers/models/transformers/transformer_wan_animate.py
new file mode 100644
index 000000000000..6a47a67385a3
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_wan_animate.py
@@ -0,0 +1,1298 @@
+# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..cache_utils import CacheMixin
+from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import FP32LayerNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+WAN_ANIMATE_MOTION_ENCODER_CHANNEL_SIZES = {
+    "4": 512,
+    "8": 512,
+    "16": 512,
+    "32": 512,
+    "64": 256,
+    "128": 128,
+    "256": 64,
+    "512": 32,
+    "1024": 16,
+}
+
+
+# Copied from diffusers.models.transformers.transformer_wan._get_qkv_projections
+def _get_qkv_projections(attn: "WanAttention", hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor):
+    # encoder_hidden_states is only passed for cross-attention
+    if encoder_hidden_states is None:
+        encoder_hidden_states = hidden_states
+
+    if attn.fused_projections:
+        if attn.cross_attention_dim_head is None:
+            # In self-attention layers, we can fuse the entire QKV projection into a single linear
+            query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
+        else:
+            # In cross-attention layers, we can only fuse the KV projections into a single linear
+            query = attn.to_q(hidden_states)
+            key, value = attn.to_kv(encoder_hidden_states).chunk(2, dim=-1)
+    else:
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+    return query, key, value
+
+
+# Copied from diffusers.models.transformers.transformer_wan._get_added_kv_projections
+def _get_added_kv_projections(attn: "WanAttention", encoder_hidden_states_img: torch.Tensor):
+    if attn.fused_projections:
+        key_img, value_img = attn.to_added_kv(encoder_hidden_states_img).chunk(2, dim=-1)
+    else:
+        key_img = attn.add_k_proj(encoder_hidden_states_img)
+        value_img = attn.add_v_proj(encoder_hidden_states_img)
+    return key_img, value_img
+
+
+class FusedLeakyReLU(nn.Module):
+    """
+    Fused LeakyRelu with scale factor and channel-wise bias.
+    """
+
+    def __init__(self, negative_slope: float = 0.2, scale: float = 2**0.5, bias_channels: Optional[int] = None):
+        super().__init__()
+        self.negative_slope = negative_slope
+        self.scale = scale
+        self.channels = bias_channels
+
+        if self.channels is not None:
+            self.bias = nn.Parameter(
+                torch.zeros(
+                    self.channels,
+                )
+            )
+        else:
+            self.bias = None
+
+    def forward(self, x: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
+        if self.bias is not None:
+            # Expand self.bias to have all singleton dims except at self.channel_dim
+            expanded_shape = [1] * x.ndim
+            expanded_shape[channel_dim] = self.bias.shape[0]
+            bias = self.bias.reshape(*expanded_shape)
+            x = x + bias
+        return F.leaky_relu(x, self.negative_slope) * self.scale
+
+
+class MotionConv2d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        bias: bool = True,
+        blur_kernel: Optional[Tuple[int, ...]] = None,
+        blur_upsample_factor: int = 1,
+        use_activation: bool = True,
+    ):
+        super().__init__()
+        self.use_activation = use_activation
+        self.in_channels = in_channels
+
+        # Handle blurring (applying a FIR filter with the given kernel) if available
+        self.blur = False
+        if blur_kernel is not None:
+            p = (len(blur_kernel) - stride) + (kernel_size - 1)
+            self.blur_padding = ((p + 1) // 2, p // 2)
+
+            kernel = torch.tensor(blur_kernel)
+            # Convert kernel to 2D if necessary
+            if kernel.ndim == 1:
+                kernel = kernel[None, :] * kernel[:, None]
+            # Normalize kernel
+            kernel = kernel / kernel.sum()
+            if blur_upsample_factor > 1:
+                kernel = kernel * (blur_upsample_factor**2)
+            self.register_buffer("blur_kernel", kernel, persistent=False)
+            self.blur = True
+
+        # Main Conv2d parameters (with scale factor)
+        self.weight = nn.Parameter(torch.randn(out_channels, in_channels, kernel_size, kernel_size))
+        self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
+
+        self.stride = stride
+        self.padding = padding
+
+        # If using an activation function, the bias will be fused into the activation
+        if bias and not self.use_activation:
+            self.bias = nn.Parameter(torch.zeros(out_channels))
+        else:
+            self.bias = None
+
+        if self.use_activation:
+            self.act_fn = FusedLeakyReLU(bias_channels=out_channels)
+        else:
+            self.act_fn = None
+
+    def forward(self, x: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
+        # Apply blur if using
+        if self.blur:
+            # NOTE: the original implementation uses a 2D upfirdn operation with the upsampling and downsampling rates
+            # set to 1, which should be equivalent to a 2D convolution
+            expanded_kernel = self.blur_kernel[None, None, :, :].expand(self.in_channels, 1, -1, -1)
+            x = F.conv2d(x, expanded_kernel, padding=self.blur_padding, groups=self.in_channels)
+
+        # Main Conv2D with scaling
+        x = F.conv2d(x, self.weight * self.scale, bias=self.bias, stride=self.stride, padding=self.padding)
+
+        # Activation with fused bias, if using
+        if self.use_activation:
+            x = self.act_fn(x, channel_dim=channel_dim)
+        return x
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},"
+            f" kernel_size={self.weight.shape[2]}, stride={self.stride}, padding={self.padding})"
+        )
+
+
+class MotionLinear(nn.Module):
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        bias: bool = True,
+        use_activation: bool = False,
+    ):
+        super().__init__()
+        self.use_activation = use_activation
+
+        # Linear weight with scale factor
+        self.weight = nn.Parameter(torch.randn(out_dim, in_dim))
+        self.scale = 1 / math.sqrt(in_dim)
+
+        # If an activation is present, the bias will be fused to it
+        if bias and not self.use_activation:
+            self.bias = nn.Parameter(torch.zeros(out_dim))
+        else:
+            self.bias = None
+
+        if self.use_activation:
+            self.act_fn = FusedLeakyReLU(bias_channels=out_dim)
+        else:
+            self.act_fn = None
+
+    def forward(self, input: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
+        out = F.linear(input, self.weight * self.scale, bias=self.bias)
+        if self.use_activation:
+            out = self.act_fn(out, channel_dim=channel_dim)
+        return out
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}(in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]},"
+            f" bias={self.bias is not None})"
+        )
+
+
+class MotionEncoderResBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        kernel_size_skip: int = 1,
+        blur_kernel: Tuple[int, ...] = (1, 3, 3, 1),
+        downsample_factor: int = 2,
+    ):
+        super().__init__()
+        self.downsample_factor = downsample_factor
+
+        # 3 x 3 Conv + fused leaky ReLU
+        self.conv1 = MotionConv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+            use_activation=True,
+        )
+
+        # 3 x 3 Conv that downsamples 2x + fused leaky ReLU
+        self.conv2 = MotionConv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=self.downsample_factor,
+            padding=0,
+            blur_kernel=blur_kernel,
+            use_activation=True,
+        )
+
+        # 1 x 1 Conv that downsamples 2x in skip connection
+        self.conv_skip = MotionConv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size_skip,
+            stride=self.downsample_factor,
+            padding=0,
+            bias=False,
+            blur_kernel=blur_kernel,
+            use_activation=False,
+        )
+
+    def forward(self, x: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
+        x_out = self.conv1(x, channel_dim)
+        x_out = self.conv2(x_out, channel_dim)
+
+        x_skip = self.conv_skip(x, channel_dim)
+
+        x_out = (x_out + x_skip) / math.sqrt(2)
+        return x_out
+
+
+class WanAnimateMotionEncoder(nn.Module):
+    def __init__(
+        self,
+        size: int = 512,
+        style_dim: int = 512,
+        motion_dim: int = 20,
+        out_dim: int = 512,
+        motion_blocks: int = 5,
+        channels: Optional[Dict[str, int]] = None,
+    ):
+        super().__init__()
+        self.size = size
+
+        # Appearance encoder: conv layers
+        if channels is None:
+            channels = WAN_ANIMATE_MOTION_ENCODER_CHANNEL_SIZES
+
+        self.conv_in = MotionConv2d(3, channels[str(size)], 1, use_activation=True)
+
+        self.res_blocks = nn.ModuleList()
+        in_channels = channels[str(size)]
+        log_size = int(math.log(size, 2))
+        for i in range(log_size, 2, -1):
+            out_channels = channels[str(2 ** (i - 1))]
+            self.res_blocks.append(MotionEncoderResBlock(in_channels, out_channels))
+            in_channels = out_channels
+
+        self.conv_out = MotionConv2d(in_channels, style_dim, 4, padding=0, bias=False, use_activation=False)
+
+        # Motion encoder: linear layers
+        # NOTE: there are no activations in between the linear layers here, which is weird but I believe matches the
+        # original code.
+        linears = [MotionLinear(style_dim, style_dim) for _ in range(motion_blocks - 1)]
+        linears.append(MotionLinear(style_dim, motion_dim))
+        self.motion_network = nn.ModuleList(linears)
+
+        self.motion_synthesis_weight = nn.Parameter(torch.randn(out_dim, motion_dim))
+
+    def forward(self, face_image: torch.Tensor, channel_dim: int = 1) -> torch.Tensor:
+        if (face_image.shape[-2] != self.size) or (face_image.shape[-1] != self.size):
+            raise ValueError(
+                f"Face pixel values has resolution ({face_image.shape[-1]}, {face_image.shape[-2]}) but is expected"
+                f" to have resolution ({self.size}, {self.size})"
+            )
+
+        # Appearance encoding through convs
+        face_image = self.conv_in(face_image, channel_dim)
+        for block in self.res_blocks:
+            face_image = block(face_image, channel_dim)
+        face_image = self.conv_out(face_image, channel_dim)
+        motion_feat = face_image.squeeze(-1).squeeze(-1)
+
+        # Motion feature extraction
+        for linear_layer in self.motion_network:
+            motion_feat = linear_layer(motion_feat, channel_dim=channel_dim)
+
+        # Motion synthesis via Linear Motion Decomposition
+        weight = self.motion_synthesis_weight + 1e-8
+        # Upcast the QR orthogonalization operation to FP32
+        original_motion_dtype = motion_feat.dtype
+        motion_feat = motion_feat.to(torch.float32)
+        weight = weight.to(torch.float32)
+
+        Q = torch.linalg.qr(weight)[0].to(device=motion_feat.device)
+
+        motion_feat_diag = torch.diag_embed(motion_feat)  # Alpha, diagonal matrix
+        motion_decomposition = torch.matmul(motion_feat_diag, Q.T)
+        motion_vec = torch.sum(motion_decomposition, dim=1)
+
+        motion_vec = motion_vec.to(dtype=original_motion_dtype)
+
+        return motion_vec
+
+
+class WanAnimateFaceEncoder(nn.Module):
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        hidden_dim: int = 1024,
+        num_heads: int = 4,
+        kernel_size: int = 3,
+        eps: float = 1e-6,
+        pad_mode: str = "replicate",
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.time_causal_padding = (kernel_size - 1, 0)
+        self.pad_mode = pad_mode
+
+        self.act = nn.SiLU()
+
+        self.conv1_local = nn.Conv1d(in_dim, hidden_dim * num_heads, kernel_size=kernel_size, stride=1)
+        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size, stride=2)
+        self.conv3 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size, stride=2)
+
+        self.norm1 = nn.LayerNorm(hidden_dim, eps, elementwise_affine=False)
+        self.norm2 = nn.LayerNorm(hidden_dim, eps, elementwise_affine=False)
+        self.norm3 = nn.LayerNorm(hidden_dim, eps, elementwise_affine=False)
+
+        self.out_proj = nn.Linear(hidden_dim, out_dim)
+
+        self.padding_tokens = nn.Parameter(torch.zeros(1, 1, 1, out_dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size = x.shape[0]
+
+        # Reshape to channels-first to apply causal Conv1d over frame dim
+        x = x.permute(0, 2, 1)
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        x = self.conv1_local(x)  # [B, C, T_padded] --> [B, N * C, T]
+        x = x.unflatten(1, (self.num_heads, -1)).flatten(0, 1)  # [B, N * C, T] --> [B * N, C, T]
+        # Reshape back to channels-last to apply LayerNorm over channel dim
+        x = x.permute(0, 2, 1)
+        x = self.norm1(x)
+        x = self.act(x)
+
+        x = x.permute(0, 2, 1)
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        x = self.conv2(x)
+        x = x.permute(0, 2, 1)
+        x = self.norm2(x)
+        x = self.act(x)
+
+        x = x.permute(0, 2, 1)
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        x = self.conv3(x)
+        x = x.permute(0, 2, 1)
+        x = self.norm3(x)
+        x = self.act(x)
+
+        x = self.out_proj(x)
+        x = x.unflatten(0, (batch_size, -1)).permute(0, 2, 1, 3)  # [B * N, T, C_out] --> [B, T, N, C_out]
+
+        padding = self.padding_tokens.repeat(batch_size, x.shape[1], 1, 1).to(device=x.device)
+        x = torch.cat([x, padding], dim=-2)  # [B, T, N, C_out] --> [B, T, N + 1, C_out]
+
+        return x
+
+
+class WanAnimateFaceBlockAttnProcessor:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                f"{self.__class__.__name__} requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or"
+                f" higher."
+            )
+
+    def __call__(
+        self,
+        attn: "WanAnimateFaceBlockCrossAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # encoder_hidden_states corresponds to the motion vec
+        # attention_mask corresponds to the motion mask (if any)
+        hidden_states = attn.pre_norm_q(hidden_states)
+        encoder_hidden_states = attn.pre_norm_kv(encoder_hidden_states)
+
+        # B --> batch_size, T --> reduced inference segment len, N --> face_encoder_num_heads + 1, C --> attn.dim
+        B, T, N, C = encoder_hidden_states.shape
+
+        query, key, value = _get_qkv_projections(attn, hidden_states, encoder_hidden_states)
+
+        query = query.unflatten(2, (attn.heads, -1))  # [B, S, H * D] --> [B, S, H, D]
+        key = key.view(B, T, N, attn.heads, -1)  # [B, T, N, H * D_kv] --> [B, T, N, H, D_kv]
+        value = value.view(B, T, N, attn.heads, -1)
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        # NOTE: the below line (which follows the official code) means that in practice, the number of frames T in
+        # encoder_hidden_states (the motion vector after applying the face encoder) must evenly divide the
+        # post-patchify sequence length S of the transformer hidden_states. Is it possible to remove this dependency?
+        query = query.unflatten(1, (T, -1)).flatten(0, 1)  # [B, S, H, D] --> [B * T, S / T, H, D]
+        key = key.flatten(0, 1)  # [B, T, N, H, D_kv] --> [B * T, N, H, D_kv]
+        value = value.flatten(0, 1)
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=None,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+        hidden_states = hidden_states.unflatten(0, (B, T)).flatten(1, 2)
+
+        hidden_states = attn.to_out(hidden_states)
+
+        if attention_mask is not None:
+            # NOTE: attention_mask is assumed to be a multiplicative mask
+            attention_mask = attention_mask.flatten(start_dim=1)
+            hidden_states = hidden_states * attention_mask
+
+        return hidden_states
+
+
+class WanAnimateFaceBlockCrossAttention(nn.Module, AttentionModuleMixin):
+    """
+    Temporally-aligned cross attention with the face motion signal in the Wan Animate Face Blocks.
+    """
+
+    _default_processor_cls = WanAnimateFaceBlockAttnProcessor
+    _available_processors = [WanAnimateFaceBlockAttnProcessor]
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        eps: float = 1e-6,
+        cross_attention_dim_head: Optional[int] = None,
+        processor=None,
+    ):
+        super().__init__()
+        self.inner_dim = dim_head * heads
+        self.heads = heads
+        self.cross_attention_head_dim = cross_attention_dim_head
+        self.kv_inner_dim = self.inner_dim if cross_attention_dim_head is None else cross_attention_dim_head * heads
+
+        # 1. Pre-Attention Norms for the hidden_states (video latents) and encoder_hidden_states (motion vector).
+        # NOTE: this is not used in "vanilla" WanAttention
+        self.pre_norm_q = nn.LayerNorm(dim, eps, elementwise_affine=False)
+        self.pre_norm_kv = nn.LayerNorm(dim, eps, elementwise_affine=False)
+
+        # 2. QKV and Output Projections
+        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_out = torch.nn.Linear(self.inner_dim, dim, bias=True)
+
+        # 3. QK Norm
+        # NOTE: this is applied after the reshape, so only over dim_head rather than dim_head * heads
+        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=True)
+        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=True)
+
+        # 4. Set attention processor
+        if processor is None:
+            processor = self._default_processor_cls()
+        self.set_processor(processor)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask)
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanAttnProcessor
+class WanAttnProcessor:
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "WanAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: "WanAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        encoder_hidden_states_img = None
+        if attn.add_k_proj is not None:
+            # 512 is the context length of the text encoder, hardcoded for now
+            image_context_length = encoder_hidden_states.shape[1] - 512
+            encoder_hidden_states_img = encoder_hidden_states[:, :image_context_length]
+            encoder_hidden_states = encoder_hidden_states[:, image_context_length:]
+
+        query, key, value = _get_qkv_projections(attn, hidden_states, encoder_hidden_states)
+
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
+
+        if rotary_emb is not None:
+
+            def apply_rotary_emb(
+                hidden_states: torch.Tensor,
+                freqs_cos: torch.Tensor,
+                freqs_sin: torch.Tensor,
+            ):
+                x1, x2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1)
+                cos = freqs_cos[..., 0::2]
+                sin = freqs_sin[..., 1::2]
+                out = torch.empty_like(hidden_states)
+                out[..., 0::2] = x1 * cos - x2 * sin
+                out[..., 1::2] = x1 * sin + x2 * cos
+                return out.type_as(hidden_states)
+
+            query = apply_rotary_emb(query, *rotary_emb)
+            key = apply_rotary_emb(key, *rotary_emb)
+
+        # I2V task
+        hidden_states_img = None
+        if encoder_hidden_states_img is not None:
+            key_img, value_img = _get_added_kv_projections(attn, encoder_hidden_states_img)
+            key_img = attn.norm_added_k(key_img)
+
+            key_img = key_img.unflatten(2, (attn.heads, -1))
+            value_img = value_img.unflatten(2, (attn.heads, -1))
+
+            hidden_states_img = dispatch_attention_fn(
+                query,
+                key_img,
+                value_img,
+                attn_mask=None,
+                dropout_p=0.0,
+                is_causal=False,
+                backend=self._attention_backend,
+                parallel_config=self._parallel_config,
+            )
+            hidden_states_img = hidden_states_img.flatten(2, 3)
+            hidden_states_img = hidden_states_img.type_as(query)
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+
+        if hidden_states_img is not None:
+            hidden_states = hidden_states + hidden_states_img
+
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanAttention
+class WanAttention(torch.nn.Module, AttentionModuleMixin):
+    _default_processor_cls = WanAttnProcessor
+    _available_processors = [WanAttnProcessor]
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        eps: float = 1e-5,
+        dropout: float = 0.0,
+        added_kv_proj_dim: Optional[int] = None,
+        cross_attention_dim_head: Optional[int] = None,
+        processor=None,
+        is_cross_attention=None,
+    ):
+        super().__init__()
+
+        self.inner_dim = dim_head * heads
+        self.heads = heads
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.cross_attention_dim_head = cross_attention_dim_head
+        self.kv_inner_dim = self.inner_dim if cross_attention_dim_head is None else cross_attention_dim_head * heads
+
+        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+        self.to_out = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(self.inner_dim, dim, bias=True),
+                torch.nn.Dropout(dropout),
+            ]
+        )
+        self.norm_q = torch.nn.RMSNorm(dim_head * heads, eps=eps, elementwise_affine=True)
+        self.norm_k = torch.nn.RMSNorm(dim_head * heads, eps=eps, elementwise_affine=True)
+
+        self.add_k_proj = self.add_v_proj = None
+        if added_kv_proj_dim is not None:
+            self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
+            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
+            self.norm_added_k = torch.nn.RMSNorm(dim_head * heads, eps=eps)
+
+        self.is_cross_attention = cross_attention_dim_head is not None
+
+        self.set_processor(processor)
+
+    def fuse_projections(self):
+        if getattr(self, "fused_projections", False):
+            return
+
+        if self.cross_attention_dim_head is None:
+            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
+            concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+            out_features, in_features = concatenated_weights.shape
+            with torch.device("meta"):
+                self.to_qkv = nn.Linear(in_features, out_features, bias=True)
+            self.to_qkv.load_state_dict(
+                {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+            )
+        else:
+            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+            concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+            out_features, in_features = concatenated_weights.shape
+            with torch.device("meta"):
+                self.to_kv = nn.Linear(in_features, out_features, bias=True)
+            self.to_kv.load_state_dict(
+                {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+            )
+
+        if self.added_kv_proj_dim is not None:
+            concatenated_weights = torch.cat([self.add_k_proj.weight.data, self.add_v_proj.weight.data])
+            concatenated_bias = torch.cat([self.add_k_proj.bias.data, self.add_v_proj.bias.data])
+            out_features, in_features = concatenated_weights.shape
+            with torch.device("meta"):
+                self.to_added_kv = nn.Linear(in_features, out_features, bias=True)
+            self.to_added_kv.load_state_dict(
+                {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+            )
+
+        self.fused_projections = True
+
+    @torch.no_grad()
+    def unfuse_projections(self):
+        if not getattr(self, "fused_projections", False):
+            return
+
+        if hasattr(self, "to_qkv"):
+            delattr(self, "to_qkv")
+        if hasattr(self, "to_kv"):
+            delattr(self, "to_kv")
+        if hasattr(self, "to_added_kv"):
+            delattr(self, "to_added_kv")
+
+        self.fused_projections = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, rotary_emb, **kwargs)
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanImageEmbedding
+class WanImageEmbedding(torch.nn.Module):
+    def __init__(self, in_features: int, out_features: int, pos_embed_seq_len=None):
+        super().__init__()
+
+        self.norm1 = FP32LayerNorm(in_features)
+        self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
+        self.norm2 = FP32LayerNorm(out_features)
+        if pos_embed_seq_len is not None:
+            self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features))
+        else:
+            self.pos_embed = None
+
+    def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
+        if self.pos_embed is not None:
+            batch_size, seq_len, embed_dim = encoder_hidden_states_image.shape
+            encoder_hidden_states_image = encoder_hidden_states_image.view(-1, 2 * seq_len, embed_dim)
+            encoder_hidden_states_image = encoder_hidden_states_image + self.pos_embed
+
+        hidden_states = self.norm1(encoder_hidden_states_image)
+        hidden_states = self.ff(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanTimeTextImageEmbedding
+class WanTimeTextImageEmbedding(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        time_freq_dim: int,
+        time_proj_dim: int,
+        text_embed_dim: int,
+        image_embed_dim: Optional[int] = None,
+        pos_embed_seq_len: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+        self.act_fn = nn.SiLU()
+        self.time_proj = nn.Linear(dim, time_proj_dim)
+        self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
+
+        self.image_embedder = None
+        if image_embed_dim is not None:
+            self.image_embedder = WanImageEmbedding(image_embed_dim, dim, pos_embed_seq_len=pos_embed_seq_len)
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        timestep_seq_len: Optional[int] = None,
+    ):
+        timestep = self.timesteps_proj(timestep)
+        if timestep_seq_len is not None:
+            timestep = timestep.unflatten(0, (-1, timestep_seq_len))
+
+        time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
+        if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
+            timestep = timestep.to(time_embedder_dtype)
+        temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
+        timestep_proj = self.time_proj(self.act_fn(temb))
+
+        encoder_hidden_states = self.text_embedder(encoder_hidden_states)
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
+
+        return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanRotaryPosEmbed
+class WanRotaryPosEmbed(nn.Module):
+    def __init__(
+        self,
+        attention_head_dim: int,
+        patch_size: Tuple[int, int, int],
+        max_seq_len: int,
+        theta: float = 10000.0,
+    ):
+        super().__init__()
+
+        self.attention_head_dim = attention_head_dim
+        self.patch_size = patch_size
+        self.max_seq_len = max_seq_len
+
+        h_dim = w_dim = 2 * (attention_head_dim // 6)
+        t_dim = attention_head_dim - h_dim - w_dim
+
+        self.t_dim = t_dim
+        self.h_dim = h_dim
+        self.w_dim = w_dim
+
+        freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+
+        freqs_cos = []
+        freqs_sin = []
+
+        for dim in [t_dim, h_dim, w_dim]:
+            freq_cos, freq_sin = get_1d_rotary_pos_embed(
+                dim,
+                max_seq_len,
+                theta,
+                use_real=True,
+                repeat_interleave_real=True,
+                freqs_dtype=freqs_dtype,
+            )
+            freqs_cos.append(freq_cos)
+            freqs_sin.append(freq_sin)
+
+        self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
+        self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.patch_size
+        ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+
+        split_sizes = [self.t_dim, self.h_dim, self.w_dim]
+
+        freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
+        freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
+
+        freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+        freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+
+        return freqs_cos, freqs_sin
+
+
+# Copied from diffusers.models.transformers.transformer_wan.WanTransformerBlock
+class WanTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        ffn_dim: int,
+        num_heads: int,
+        qk_norm: str = "rms_norm_across_heads",
+        cross_attn_norm: bool = False,
+        eps: float = 1e-6,
+        added_kv_proj_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        # 1. Self-attention
+        self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+        self.attn1 = WanAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            cross_attention_dim_head=None,
+            processor=WanAttnProcessor(),
+        )
+
+        # 2. Cross-attention
+        self.attn2 = WanAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            added_kv_proj_dim=added_kv_proj_dim,
+            cross_attention_dim_head=dim // num_heads,
+            processor=WanAttnProcessor(),
+        )
+        self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+        # 3. Feed-forward
+        self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+        self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        rotary_emb: torch.Tensor,
+    ) -> torch.Tensor:
+        if temb.ndim == 4:
+            # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table.unsqueeze(0) + temb.float()
+            ).chunk(6, dim=2)
+            # batch_size, seq_len, 1, inner_dim
+            shift_msa = shift_msa.squeeze(2)
+            scale_msa = scale_msa.squeeze(2)
+            gate_msa = gate_msa.squeeze(2)
+            c_shift_msa = c_shift_msa.squeeze(2)
+            c_scale_msa = c_scale_msa.squeeze(2)
+            c_gate_msa = c_gate_msa.squeeze(2)
+        else:
+            # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B)
+            shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                self.scale_shift_table + temb.float()
+            ).chunk(6, dim=1)
+
+        # 1. Self-attention
+        norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+        attn_output = self.attn1(norm_hidden_states, None, None, rotary_emb)
+        hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+
+        # 2. Cross-attention
+        norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+        attn_output = self.attn2(norm_hidden_states, encoder_hidden_states, None, None)
+        hidden_states = hidden_states + attn_output
+
+        # 3. Feed-forward
+        norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
+            hidden_states
+        )
+        ff_output = self.ffn(norm_hidden_states)
+        hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+
+        return hidden_states
+
+
+class WanAnimateTransformer3DModel(
+    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
+):
+    r"""
+    A Transformer model for video-like data used in the WanAnimate model.
+
+    Args:
+        patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+        num_attention_heads (`int`, defaults to `40`):
+            Fixed length for text embeddings.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output.
+        text_dim (`int`, defaults to `512`):
+            Input dimension for text embeddings.
+        freq_dim (`int`, defaults to `256`):
+            Dimension for sinusoidal time embeddings.
+        ffn_dim (`int`, defaults to `13824`):
+            Intermediate dimension in feed-forward network.
+        num_layers (`int`, defaults to `40`):
+            The number of layers of transformer blocks to use.
+        window_size (`Tuple[int]`, defaults to `(-1, -1)`):
+            Window size for local attention (-1 indicates global attention).
+        cross_attn_norm (`bool`, defaults to `True`):
+            Enable cross-attention normalization.
+        qk_norm (`bool`, defaults to `True`):
+            Enable query/key normalization.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon value for normalization layers.
+        image_dim (`int`, *optional*, defaults to `1280`):
+            The number of channels to use for the image embedding. If `None`, no projection is used.
+        added_kv_proj_dim (`int`, *optional*, defaults to `5120`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+    """
+
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+    _no_split_modules = ["WanTransformerBlock", "MotionEncoderResBlock"]
+    _keep_in_fp32_modules = [
+        "time_embedder",
+        "scale_shift_table",
+        "norm1",
+        "norm2",
+        "norm3",
+        "motion_synthesis_weight",
+    ]
+    _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+    _repeated_blocks = ["WanTransformerBlock"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: Tuple[int] = (1, 2, 2),
+        num_attention_heads: int = 40,
+        attention_head_dim: int = 128,
+        in_channels: Optional[int] = 36,
+        latent_channels: Optional[int] = 16,
+        out_channels: Optional[int] = 16,
+        text_dim: int = 4096,
+        freq_dim: int = 256,
+        ffn_dim: int = 13824,
+        num_layers: int = 40,
+        cross_attn_norm: bool = True,
+        qk_norm: Optional[str] = "rms_norm_across_heads",
+        eps: float = 1e-6,
+        image_dim: Optional[int] = 1280,
+        added_kv_proj_dim: Optional[int] = None,
+        rope_max_seq_len: int = 1024,
+        pos_embed_seq_len: Optional[int] = None,
+        motion_encoder_channel_sizes: Optional[Dict[str, int]] = None,  # Start of Wan Animate-specific args
+        motion_encoder_size: int = 512,
+        motion_style_dim: int = 512,
+        motion_dim: int = 20,
+        motion_encoder_dim: int = 512,
+        face_encoder_hidden_dim: int = 1024,
+        face_encoder_num_heads: int = 4,
+        inject_face_latents_blocks: int = 5,
+        motion_encoder_batch_size: int = 8,
+    ) -> None:
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+        # Allow either only in_channels or only latent_channels to be set for convenience
+        if in_channels is None and latent_channels is not None:
+            in_channels = 2 * latent_channels + 4
+        elif in_channels is not None and latent_channels is None:
+            latent_channels = (in_channels - 4) // 2
+        elif in_channels is not None and latent_channels is not None:
+            # TODO: should this always be true?
+            assert in_channels == 2 * latent_channels + 4, "in_channels should be 2 * latent_channels + 4"
+        else:
+            raise ValueError("At least one of `in_channels` and `latent_channels` must be supplied.")
+        out_channels = out_channels or latent_channels
+
+        # 1. Patch & position embedding
+        self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+        self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+        self.pose_patch_embedding = nn.Conv3d(latent_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+        # 2. Condition embeddings
+        self.condition_embedder = WanTimeTextImageEmbedding(
+            dim=inner_dim,
+            time_freq_dim=freq_dim,
+            time_proj_dim=inner_dim * 6,
+            text_embed_dim=text_dim,
+            image_embed_dim=image_dim,
+            pos_embed_seq_len=pos_embed_seq_len,
+        )
+
+        # Motion encoder
+        self.motion_encoder = WanAnimateMotionEncoder(
+            size=motion_encoder_size,
+            style_dim=motion_style_dim,
+            motion_dim=motion_dim,
+            out_dim=motion_encoder_dim,
+            channels=motion_encoder_channel_sizes,
+        )
+
+        # Face encoder
+        self.face_encoder = WanAnimateFaceEncoder(
+            in_dim=motion_encoder_dim,
+            out_dim=inner_dim,
+            hidden_dim=face_encoder_hidden_dim,
+            num_heads=face_encoder_num_heads,
+        )
+
+        # 3. Transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                WanTransformerBlock(
+                    dim=inner_dim,
+                    ffn_dim=ffn_dim,
+                    num_heads=num_attention_heads,
+                    qk_norm=qk_norm,
+                    cross_attn_norm=cross_attn_norm,
+                    eps=eps,
+                    added_kv_proj_dim=added_kv_proj_dim,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        self.face_adapter = nn.ModuleList(
+            [
+                WanAnimateFaceBlockCrossAttention(
+                    dim=inner_dim,
+                    heads=num_attention_heads,
+                    dim_head=inner_dim // num_attention_heads,
+                    eps=eps,
+                    cross_attention_dim_head=inner_dim // num_attention_heads,
+                    processor=WanAnimateFaceBlockAttnProcessor(),
+                )
+                for _ in range(num_layers // inject_face_latents_blocks)
+            ]
+        )
+
+        # 4. Output norm & projection
+        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
+        self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        pose_hidden_states: Optional[torch.Tensor] = None,
+        face_pixel_values: Optional[torch.Tensor] = None,
+        motion_encode_batch_size: Optional[int] = None,
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Forward pass of Wan2.2-Animate transformer model.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(B, 2C + 4, T + 1, H, W)`):
+                Input noisy video latents of shape `(B, 2C + 4, T + 1, H, W)`, where B is the batch size, C is the
+                number of latent channels (16 for Wan VAE), T is the number of latent frames in an inference segment, H
+                is the latent height, and W is the latent width.
+            timestep: (`torch.LongTensor`):
+                The current timestep in the denoising loop.
+            encoder_hidden_states (`torch.Tensor`):
+                Text embeddings from the text encoder (umT5 for Wan Animate).
+            encoder_hidden_states_image (`torch.Tensor`):
+                CLIP visual features of the reference (character) image.
+            pose_hidden_states (`torch.Tensor` of shape `(B, C, T, H, W)`):
+                Pose video latents. TODO: description
+            face_pixel_values (`torch.Tensor` of shape `(B, C', S, H', W')`):
+                Face video in pixel space (not latent space). Typically C' = 3 and H' and W' are the height/width of
+                the face video in pixels. Here S is the inference segment length, usually set to 77.
+            motion_encode_batch_size (`int`, *optional*):
+                The batch size for batched encoding of the face video via the motion encoder. Will default to
+                `self.config.motion_encoder_batch_size` if not set.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return the output as a dict or tuple.
+        """
+
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        # Check that shapes match up
+        if pose_hidden_states is not None and pose_hidden_states.shape[2] + 1 != hidden_states.shape[2]:
+            raise ValueError(
+                f"pose_hidden_states frame dim (dim 2) is {pose_hidden_states.shape[2]} but must be one less than the"
+                f" hidden_states's corresponding frame dim: {hidden_states.shape[2]}"
+            )
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p_t, p_h, p_w = self.config.patch_size
+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p_h
+        post_patch_width = width // p_w
+
+        # 1. Rotary position embedding
+        rotary_emb = self.rope(hidden_states)
+
+        # 2. Patch embedding
+        hidden_states = self.patch_embedding(hidden_states)
+        pose_hidden_states = self.pose_patch_embedding(pose_hidden_states)
+        # Add pose embeddings to hidden states
+        hidden_states[:, :, 1:] = hidden_states[:, :, 1:] + pose_hidden_states
+        # Calling contiguous() here is important so that we don't recompile when performing regional compilation
+        hidden_states = hidden_states.flatten(2).transpose(1, 2).contiguous()
+
+        # 3. Condition embeddings (time, text, image)
+        # Wan Animate is based on Wan 2.1 and thus uses Wan 2.1's timestep logic
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=None
+        )
+
+        # batch_size, 6, inner_dim
+        timestep_proj = timestep_proj.unflatten(1, (6, -1))
+
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        # 4. Get motion features from the face video
+        # Motion vector computation from face pixel values
+        batch_size, channels, num_face_frames, height, width = face_pixel_values.shape
+        # Rearrange from (B, C, T, H, W) to (B*T, C, H, W)
+        face_pixel_values = face_pixel_values.permute(0, 2, 1, 3, 4).reshape(-1, channels, height, width)
+
+        # Extract motion features using motion encoder
+        # Perform batched motion encoder inference to allow trading off inference speed for memory usage
+        motion_encode_batch_size = motion_encode_batch_size or self.config.motion_encoder_batch_size
+        face_batches = torch.split(face_pixel_values, motion_encode_batch_size)
+        motion_vec_batches = []
+        for face_batch in face_batches:
+            motion_vec_batch = self.motion_encoder(face_batch)
+            motion_vec_batches.append(motion_vec_batch)
+        motion_vec = torch.cat(motion_vec_batches)
+        motion_vec = motion_vec.view(batch_size, num_face_frames, -1)
+
+        # Now get face features from the motion vector
+        motion_vec = self.face_encoder(motion_vec)
+
+        # Add padding at the beginning (prepend zeros)
+        pad_face = torch.zeros_like(motion_vec[:, :1])
+        motion_vec = torch.cat([pad_face, motion_vec], dim=1)
+
+        # 5. Transformer blocks with face adapter integration
+        for block_idx, block in enumerate(self.blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(
+                    block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
+                )
+            else:
+                hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
+
+            # Face adapter integration: apply after every 5th block (0, 5, 10, 15, ...)
+            if block_idx % self.config.inject_face_latents_blocks == 0:
+                face_adapter_block_idx = block_idx // self.config.inject_face_latents_blocks
+                face_adapter_output = self.face_adapter[face_adapter_block_idx](hidden_states, motion_vec)
+                # In case the face adapter and main transformer blocks are on different devices, which can happen when
+                # using model parallelism
+                face_adapter_output = face_adapter_output.to(device=hidden_states.device)
+                hidden_states = face_adapter_output + hidden_states
+
+        # 6. Output norm, projection & unpatchify
+        # batch_size, inner_dim
+        shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
+
+        hidden_states_original_dtype = hidden_states.dtype
+        hidden_states = self.norm_out(hidden_states.float())
+        # Move the shift and scale tensors to the same device as hidden_states.
+        # When using multi-GPU inference via accelerate these will be on the
+        # first device rather than the last device, which hidden_states ends up
+        # on.
+        shift = shift.to(hidden_states.device)
+        scale = scale.to(hidden_states.device)
+        hidden_states = (hidden_states * (1 + scale) + shift).to(dtype=hidden_states_original_dtype)
+
+        hidden_states = self.proj_out(hidden_states)
+
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
+        )
+        hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+        output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
index 30c38c244ad8..1be4f73e33e2 100644
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -182,7 +182,7 @@ class WanVACETransformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 40,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_z_image.py b/src/diffusers/models/transformers/transformer_z_image.py
new file mode 100644
index 000000000000..5c401b9d202b
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_z_image.py
@@ -0,0 +1,653 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...models.attention_processor import Attention
+from ...models.modeling_utils import ModelMixin
+from ...models.normalization import RMSNorm
+from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention_dispatch import dispatch_attention_fn
+from ..modeling_outputs import Transformer2DModelOutput
+
+
+ADALN_EMBED_DIM = 256
+SEQ_MULTI_OF = 32
+
+
+class TimestepEmbedder(nn.Module):
+    def __init__(self, out_size, mid_size=None, frequency_embedding_size=256):
+        super().__init__()
+        if mid_size is None:
+            mid_size = out_size
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, mid_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(mid_size, out_size, bias=True),
+        )
+
+        self.frequency_embedding_size = frequency_embedding_size
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        with torch.amp.autocast("cuda", enabled=False):
+            half = dim // 2
+            freqs = torch.exp(
+                -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
+            )
+            args = t[:, None].float() * freqs[None]
+            embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+            if dim % 2:
+                embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+            return embedding
+
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        weight_dtype = self.mlp[0].weight.dtype
+        compute_dtype = getattr(self.mlp[0], "compute_dtype", None)
+        if weight_dtype.is_floating_point:
+            t_freq = t_freq.to(weight_dtype)
+        elif compute_dtype is not None:
+            t_freq = t_freq.to(compute_dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+
+
+class ZSingleStreamAttnProcessor:
+    """
+    Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
+    original Z-ImageAttention module.
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        freqs_cis: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        query = query.unflatten(-1, (attn.heads, -1))
+        key = key.unflatten(-1, (attn.heads, -1))
+        value = value.unflatten(-1, (attn.heads, -1))
+
+        # Apply Norms
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Apply RoPE
+        def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+            with torch.amp.autocast("cuda", enabled=False):
+                x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
+                freqs_cis = freqs_cis.unsqueeze(2)
+                x_out = torch.view_as_real(x * freqs_cis).flatten(3)
+                return x_out.type_as(x_in)  # todo
+
+        if freqs_cis is not None:
+            query = apply_rotary_emb(query, freqs_cis)
+            key = apply_rotary_emb(key, freqs_cis)
+
+        # Cast to correct dtype
+        dtype = query.dtype
+        query, key = query.to(dtype), key.to(dtype)
+
+        # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
+        if attention_mask is not None and attention_mask.ndim == 2:
+            attention_mask = attention_mask[:, None, None, :]
+
+        # Compute joint attention
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+
+        # Reshape back
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.to(dtype)
+
+        output = attn.to_out[0](hidden_states)
+        if len(attn.to_out) > 1:  # dropout
+            output = attn.to_out[1](output)
+
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def _forward_silu_gating(self, x1, x3):
+        return F.silu(x1) * x3
+
+    def forward(self, x):
+        return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
+
+
+@maybe_allow_in_graph
+class ZImageTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: int,
+        norm_eps: float,
+        qk_norm: bool,
+        modulation=True,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.head_dim = dim // n_heads
+
+        # Refactored to use diffusers Attention with custom processor
+        # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm
+        self.attention = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,
+            dim_head=dim // n_heads,
+            heads=n_heads,
+            qk_norm="rms_norm" if qk_norm else None,
+            eps=1e-5,
+            bias=False,
+            out_bias=False,
+            processor=ZSingleStreamAttnProcessor(),
+        )
+
+        self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
+        self.layer_id = layer_id
+
+        self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
+        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
+
+        self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
+        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
+
+        self.modulation = modulation
+        if modulation:
+            self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True))
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        adaln_input: Optional[torch.Tensor] = None,
+    ):
+        if self.modulation:
+            assert adaln_input is not None
+            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
+            gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
+            scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
+
+            # Attention block
+            attn_out = self.attention(
+                self.attention_norm1(x) * scale_msa, attention_mask=attn_mask, freqs_cis=freqs_cis
+            )
+            x = x + gate_msa * self.attention_norm2(attn_out)
+
+            # FFN block
+            x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp))
+        else:
+            # Attention block
+            attn_out = self.attention(self.attention_norm1(x), attention_mask=attn_mask, freqs_cis=freqs_cis)
+            x = x + self.attention_norm2(attn_out)
+
+            # FFN block
+            x = x + self.ffn_norm2(self.feed_forward(self.ffn_norm1(x)))
+
+        return x
+
+
+class FinalLayer(nn.Module):
+    def __init__(self, hidden_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
+        )
+
+    def forward(self, x, c):
+        scale = 1.0 + self.adaLN_modulation(c)
+        x = self.norm_final(x) * scale.unsqueeze(1)
+        x = self.linear(x)
+        return x
+
+
+class RopeEmbedder:
+    def __init__(
+        self,
+        theta: float = 256.0,
+        axes_dims: List[int] = (16, 56, 56),
+        axes_lens: List[int] = (64, 128, 128),
+    ):
+        self.theta = theta
+        self.axes_dims = axes_dims
+        self.axes_lens = axes_lens
+        assert len(axes_dims) == len(axes_lens), "axes_dims and axes_lens must have the same length"
+        self.freqs_cis = None
+
+    @staticmethod
+    def precompute_freqs_cis(dim: List[int], end: List[int], theta: float = 256.0):
+        with torch.device("cpu"):
+            freqs_cis = []
+            for i, (d, e) in enumerate(zip(dim, end)):
+                freqs = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d))
+                timestep = torch.arange(e, device=freqs.device, dtype=torch.float64)
+                freqs = torch.outer(timestep, freqs).float()
+                freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to(torch.complex64)  # complex64
+                freqs_cis.append(freqs_cis_i)
+
+            return freqs_cis
+
+    def __call__(self, ids: torch.Tensor):
+        assert ids.ndim == 2
+        assert ids.shape[-1] == len(self.axes_dims)
+        device = ids.device
+
+        if self.freqs_cis is None:
+            self.freqs_cis = self.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta)
+            self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
+        else:
+            # Ensure freqs_cis are on the same device as ids
+            if self.freqs_cis[0].device != device:
+                self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
+
+        result = []
+        for i in range(len(self.axes_dims)):
+            index = ids[:, i]
+            result.append(self.freqs_cis[i][index])
+        return torch.cat(result, dim=-1)
+
+
+class ZImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["ZImageTransformerBlock"]
+    _repeated_blocks = ["ZImageTransformerBlock"]
+    _skip_layerwise_casting_patterns = ["t_embedder", "cap_embedder"]  # precision sensitive layers
+
+    @register_to_config
+    def __init__(
+        self,
+        all_patch_size=(2,),
+        all_f_patch_size=(1,),
+        in_channels=16,
+        dim=3840,
+        n_layers=30,
+        n_refiner_layers=2,
+        n_heads=30,
+        n_kv_heads=30,
+        norm_eps=1e-5,
+        qk_norm=True,
+        cap_feat_dim=2560,
+        rope_theta=256.0,
+        t_scale=1000.0,
+        axes_dims=[32, 48, 48],
+        axes_lens=[1024, 512, 512],
+    ) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.all_patch_size = all_patch_size
+        self.all_f_patch_size = all_f_patch_size
+        self.dim = dim
+        self.n_heads = n_heads
+
+        self.rope_theta = rope_theta
+        self.t_scale = t_scale
+        self.gradient_checkpointing = False
+
+        assert len(all_patch_size) == len(all_f_patch_size)
+
+        all_x_embedder = {}
+        all_final_layer = {}
+        for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
+            x_embedder = nn.Linear(f_patch_size * patch_size * patch_size * in_channels, dim, bias=True)
+            all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
+
+            final_layer = FinalLayer(dim, patch_size * patch_size * f_patch_size * self.out_channels)
+            all_final_layer[f"{patch_size}-{f_patch_size}"] = final_layer
+
+        self.all_x_embedder = nn.ModuleDict(all_x_embedder)
+        self.all_final_layer = nn.ModuleDict(all_final_layer)
+        self.noise_refiner = nn.ModuleList(
+            [
+                ZImageTransformerBlock(
+                    1000 + layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    norm_eps,
+                    qk_norm,
+                    modulation=True,
+                )
+                for layer_id in range(n_refiner_layers)
+            ]
+        )
+        self.context_refiner = nn.ModuleList(
+            [
+                ZImageTransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    norm_eps,
+                    qk_norm,
+                    modulation=False,
+                )
+                for layer_id in range(n_refiner_layers)
+            ]
+        )
+        self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024)
+        self.cap_embedder = nn.Sequential(RMSNorm(cap_feat_dim, eps=norm_eps), nn.Linear(cap_feat_dim, dim, bias=True))
+
+        self.x_pad_token = nn.Parameter(torch.empty((1, dim)))
+        self.cap_pad_token = nn.Parameter(torch.empty((1, dim)))
+
+        self.layers = nn.ModuleList(
+            [
+                ZImageTransformerBlock(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm)
+                for layer_id in range(n_layers)
+            ]
+        )
+        head_dim = dim // n_heads
+        assert head_dim == sum(axes_dims)
+        self.axes_dims = axes_dims
+        self.axes_lens = axes_lens
+
+        self.rope_embedder = RopeEmbedder(theta=rope_theta, axes_dims=axes_dims, axes_lens=axes_lens)
+
+    def unpatchify(self, x: List[torch.Tensor], size: List[Tuple], patch_size, f_patch_size) -> List[torch.Tensor]:
+        pH = pW = patch_size
+        pF = f_patch_size
+        bsz = len(x)
+        assert len(size) == bsz
+        for i in range(bsz):
+            F, H, W = size[i]
+            ori_len = (F // pF) * (H // pH) * (W // pW)
+            # "f h w pf ph pw c -> c (f pf) (h ph) (w pw)"
+            x[i] = (
+                x[i][:ori_len]
+                .view(F // pF, H // pH, W // pW, pF, pH, pW, self.out_channels)
+                .permute(6, 0, 3, 1, 4, 2, 5)
+                .reshape(self.out_channels, F, H, W)
+            )
+        return x
+
+    @staticmethod
+    def create_coordinate_grid(size, start=None, device=None):
+        if start is None:
+            start = (0 for _ in size)
+
+        axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)]
+        grids = torch.meshgrid(axes, indexing="ij")
+        return torch.stack(grids, dim=-1)
+
+    def patchify_and_embed(
+        self,
+        all_image: List[torch.Tensor],
+        all_cap_feats: List[torch.Tensor],
+        patch_size: int,
+        f_patch_size: int,
+    ):
+        pH = pW = patch_size
+        pF = f_patch_size
+        device = all_image[0].device
+
+        all_image_out = []
+        all_image_size = []
+        all_image_pos_ids = []
+        all_image_pad_mask = []
+        all_cap_pos_ids = []
+        all_cap_pad_mask = []
+        all_cap_feats_out = []
+
+        for i, (image, cap_feat) in enumerate(zip(all_image, all_cap_feats)):
+            ### Process Caption
+            cap_ori_len = len(cap_feat)
+            cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
+            # padded position ids
+            cap_padded_pos_ids = self.create_coordinate_grid(
+                size=(cap_ori_len + cap_padding_len, 1, 1),
+                start=(1, 0, 0),
+                device=device,
+            ).flatten(0, 2)
+            all_cap_pos_ids.append(cap_padded_pos_ids)
+            # pad mask
+            cap_pad_mask = torch.cat(
+                [
+                    torch.zeros((cap_ori_len,), dtype=torch.bool, device=device),
+                    torch.ones((cap_padding_len,), dtype=torch.bool, device=device),
+                ],
+                dim=0,
+            )
+            all_cap_pad_mask.append(
+                cap_pad_mask if cap_padding_len > 0 else torch.zeros((cap_ori_len,), dtype=torch.bool, device=device)
+            )
+
+            # padded feature
+            cap_padded_feat = torch.cat([cap_feat, cap_feat[-1:].repeat(cap_padding_len, 1)], dim=0)
+            all_cap_feats_out.append(cap_padded_feat)
+
+            ### Process Image
+            C, F, H, W = image.size()
+            all_image_size.append((F, H, W))
+            F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
+
+            image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
+            # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
+            image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
+
+            image_ori_len = len(image)
+            image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
+
+            image_ori_pos_ids = self.create_coordinate_grid(
+                size=(F_tokens, H_tokens, W_tokens),
+                start=(cap_ori_len + cap_padding_len + 1, 0, 0),
+                device=device,
+            ).flatten(0, 2)
+            image_padded_pos_ids = torch.cat(
+                [
+                    image_ori_pos_ids,
+                    self.create_coordinate_grid(size=(1, 1, 1), start=(0, 0, 0), device=device)
+                    .flatten(0, 2)
+                    .repeat(image_padding_len, 1),
+                ],
+                dim=0,
+            )
+            all_image_pos_ids.append(image_padded_pos_ids if image_padding_len > 0 else image_ori_pos_ids)
+            # pad mask
+            image_pad_mask = torch.cat(
+                [
+                    torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
+                    torch.ones((image_padding_len,), dtype=torch.bool, device=device),
+                ],
+                dim=0,
+            )
+            all_image_pad_mask.append(
+                image_pad_mask
+                if image_padding_len > 0
+                else torch.zeros((image_ori_len,), dtype=torch.bool, device=device)
+            )
+            # padded feature
+            image_padded_feat = torch.cat(
+                [image, image[-1:].repeat(image_padding_len, 1)],
+                dim=0,
+            )
+            all_image_out.append(image_padded_feat if image_padding_len > 0 else image)
+
+        return (
+            all_image_out,
+            all_cap_feats_out,
+            all_image_size,
+            all_image_pos_ids,
+            all_cap_pos_ids,
+            all_image_pad_mask,
+            all_cap_pad_mask,
+        )
+
+    def forward(
+        self,
+        x: List[torch.Tensor],
+        t,
+        cap_feats: List[torch.Tensor],
+        patch_size=2,
+        f_patch_size=1,
+        return_dict: bool = True,
+    ):
+        assert patch_size in self.all_patch_size
+        assert f_patch_size in self.all_f_patch_size
+
+        bsz = len(x)
+        device = x[0].device
+        t = t * self.t_scale
+        t = self.t_embedder(t)
+
+        (
+            x,
+            cap_feats,
+            x_size,
+            x_pos_ids,
+            cap_pos_ids,
+            x_inner_pad_mask,
+            cap_inner_pad_mask,
+        ) = self.patchify_and_embed(x, cap_feats, patch_size, f_patch_size)
+
+        # x embed & refine
+        x_item_seqlens = [len(_) for _ in x]
+        assert all(_ % SEQ_MULTI_OF == 0 for _ in x_item_seqlens)
+        x_max_item_seqlen = max(x_item_seqlens)
+
+        x = torch.cat(x, dim=0)
+        x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x)
+
+        # Match t_embedder output dtype to x for layerwise casting compatibility
+        adaln_input = t.type_as(x)
+        x[torch.cat(x_inner_pad_mask)] = self.x_pad_token
+        x = list(x.split(x_item_seqlens, dim=0))
+        x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split([len(_) for _ in x_pos_ids], dim=0))
+
+        x = pad_sequence(x, batch_first=True, padding_value=0.0)
+        x_freqs_cis = pad_sequence(x_freqs_cis, batch_first=True, padding_value=0.0)
+        # Clarify the length matches to satisfy Dynamo due to "Symbolic Shape Inference" to avoid compilation errors
+        x_freqs_cis = x_freqs_cis[:, : x.shape[1]]
+
+        x_attn_mask = torch.zeros((bsz, x_max_item_seqlen), dtype=torch.bool, device=device)
+        for i, seq_len in enumerate(x_item_seqlens):
+            x_attn_mask[i, :seq_len] = 1
+
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for layer in self.noise_refiner:
+                x = self._gradient_checkpointing_func(layer, x, x_attn_mask, x_freqs_cis, adaln_input)
+        else:
+            for layer in self.noise_refiner:
+                x = layer(x, x_attn_mask, x_freqs_cis, adaln_input)
+
+        # cap embed & refine
+        cap_item_seqlens = [len(_) for _ in cap_feats]
+        cap_max_item_seqlen = max(cap_item_seqlens)
+
+        cap_feats = torch.cat(cap_feats, dim=0)
+        cap_feats = self.cap_embedder(cap_feats)
+        cap_feats[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token
+        cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0))
+        cap_freqs_cis = list(
+            self.rope_embedder(torch.cat(cap_pos_ids, dim=0)).split([len(_) for _ in cap_pos_ids], dim=0)
+        )
+
+        cap_feats = pad_sequence(cap_feats, batch_first=True, padding_value=0.0)
+        cap_freqs_cis = pad_sequence(cap_freqs_cis, batch_first=True, padding_value=0.0)
+        # Clarify the length matches to satisfy Dynamo due to "Symbolic Shape Inference" to avoid compilation errors
+        cap_freqs_cis = cap_freqs_cis[:, : cap_feats.shape[1]]
+
+        cap_attn_mask = torch.zeros((bsz, cap_max_item_seqlen), dtype=torch.bool, device=device)
+        for i, seq_len in enumerate(cap_item_seqlens):
+            cap_attn_mask[i, :seq_len] = 1
+
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for layer in self.context_refiner:
+                cap_feats = self._gradient_checkpointing_func(layer, cap_feats, cap_attn_mask, cap_freqs_cis)
+        else:
+            for layer in self.context_refiner:
+                cap_feats = layer(cap_feats, cap_attn_mask, cap_freqs_cis)
+
+        # unified
+        unified = []
+        unified_freqs_cis = []
+        for i in range(bsz):
+            x_len = x_item_seqlens[i]
+            cap_len = cap_item_seqlens[i]
+            unified.append(torch.cat([x[i][:x_len], cap_feats[i][:cap_len]]))
+            unified_freqs_cis.append(torch.cat([x_freqs_cis[i][:x_len], cap_freqs_cis[i][:cap_len]]))
+        unified_item_seqlens = [a + b for a, b in zip(cap_item_seqlens, x_item_seqlens)]
+        assert unified_item_seqlens == [len(_) for _ in unified]
+        unified_max_item_seqlen = max(unified_item_seqlens)
+
+        unified = pad_sequence(unified, batch_first=True, padding_value=0.0)
+        unified_freqs_cis = pad_sequence(unified_freqs_cis, batch_first=True, padding_value=0.0)
+        unified_attn_mask = torch.zeros((bsz, unified_max_item_seqlen), dtype=torch.bool, device=device)
+        for i, seq_len in enumerate(unified_item_seqlens):
+            unified_attn_mask[i, :seq_len] = 1
+
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for layer in self.layers:
+                unified = self._gradient_checkpointing_func(
+                    layer, unified, unified_attn_mask, unified_freqs_cis, adaln_input
+                )
+        else:
+            for layer in self.layers:
+                unified = layer(unified, unified_attn_mask, unified_freqs_cis, adaln_input)
+
+        unified = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
+        unified = list(unified.unbind(dim=0))
+        x = self.unpatchify(unified, x_size, patch_size, f_patch_size)
+
+        if not return_dict:
+            return (x,)
+
+        return Transformer2DModelOutput(sample=x)
diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py
index 4c4c528a59ad..a027c553ed06 100644
--- a/src/diffusers/models/unets/unet_1d.py
+++ b/src/diffusers/models/unets/unet_1d.py
@@ -86,11 +86,11 @@ def __init__(
         flip_sin_to_cos: bool = True,
         use_timestep_embedding: bool = False,
         freq_shift: float = 0.0,
-        down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
-        up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
-        mid_block_type: Tuple[str] = "UNetMidBlock1D",
+        down_block_types: Tuple[str, ...] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+        up_block_types: Tuple[str, ...] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        mid_block_type: str = "UNetMidBlock1D",
         out_block_type: str = None,
-        block_out_channels: Tuple[int] = (32, 32, 64),
+        block_out_channels: Tuple[int, ...] = (32, 32, 64),
         act_fn: str = None,
         norm_num_groups: int = 8,
         layers_per_block: int = 1,
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index f04d3dfa0136..e669aa51a54e 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -22,11 +22,11 @@
 from ...loaders.single_file_model import FromOriginalModelMixin
 from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
 from ..activations import get_activation
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     Attention,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
     FusedAttnProcessor2_0,
@@ -68,7 +68,7 @@ class UNet2DConditionOutput(BaseOutput):
 
 
 class UNet2DConditionModel(
-    ModelMixin, ConfigMixin, FromOriginalModelMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin
+    ModelMixin, AttentionMixin, ConfigMixin, FromOriginalModelMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin
 ):
     r"""
     A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
@@ -177,16 +177,21 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -486,10 +491,10 @@ def __init__(
 
     def _check_config(
         self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
+        down_block_types: Tuple[str, ...],
+        up_block_types: Tuple[str, ...],
         only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
+        block_out_channels: Tuple[int, ...],
         layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
@@ -696,64 +701,6 @@ def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: i
                 positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
             )
 
-    @property
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py
index 6a119185b812..26dc50f84acd 100644
--- a/src/diffusers/models/unets/unet_3d_condition.py
+++ b/src/diffusers/models/unets/unet_3d_condition.py
@@ -23,11 +23,11 @@
 from ...loaders import UNet2DConditionLoadersMixin
 from ...utils import BaseOutput, logging
 from ..activations import get_activation
+from ..attention import AttentionMixin
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     Attention,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
     FusedAttnProcessor2_0,
@@ -58,7 +58,7 @@ class UNet3DConditionOutput(BaseOutput):
     sample: torch.Tensor
 
 
-class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class UNet3DConditionModel(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     r"""
     A conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
     shaped output.
@@ -285,31 +285,6 @@ def __init__(
             block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
         )
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
     def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
         r"""
@@ -376,41 +351,6 @@ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[i
         for module in self.children():
             fn_recursive_set_attention_slice(module, reversed_slice_size)
 
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
         """
         Sets the attention processor to use [feed forward
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py
index 3dba8edca79d..0ada264417dd 100644
--- a/src/diffusers/models/unets/unet_i2vgen_xl.py
+++ b/src/diffusers/models/unets/unet_i2vgen_xl.py
@@ -21,11 +21,10 @@
 from ...loaders import UNet2DConditionLoadersMixin
 from ...utils import logging
 from ..activations import get_activation
-from ..attention import Attention, FeedForward
+from ..attention import Attention, AttentionMixin, FeedForward
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
     FusedAttnProcessor2_0,
@@ -93,7 +92,7 @@ def forward(
         return hidden_states
 
 
-class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class I2VGenXLUNet(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     r"""
     I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and
     returns a sample-shaped output.
@@ -313,66 +312,6 @@ def __init__(
         self.conv_act = get_activation("silu")
         self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
     def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
         """
diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py
index 27241ce2e674..13f4641a4c50 100644
--- a/src/diffusers/models/unets/unet_kandinsky3.py
+++ b/src/diffusers/models/unets/unet_kandinsky3.py
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-from typing import Dict, Tuple, Union
+from typing import Tuple, Union
 
 import torch
 from torch import nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import BaseOutput, logging
-from ..attention_processor import Attention, AttentionProcessor, AttnProcessor
+from ..attention import AttentionMixin
+from ..attention_processor import Attention, AttnProcessor
 from ..embeddings import TimestepEmbedding, Timesteps
 from ..modeling_utils import ModelMixin
 
@@ -45,7 +46,7 @@ def forward(self, x):
         return x
 
 
-class Kandinsky3UNet(ModelMixin, ConfigMixin):
+class Kandinsky3UNet(ModelMixin, AttentionMixin, ConfigMixin):
     @register_to_config
     def __init__(
         self,
@@ -54,7 +55,7 @@ def __init__(
         groups: int = 32,
         attention_head_dim: int = 64,
         layers_per_block: Union[int, Tuple[int]] = 3,
-        block_out_channels: Tuple[int] = (384, 768, 1536, 3072),
+        block_out_channels: Tuple[int, ...] = (384, 768, 1536, 3072),
         cross_attention_dim: Union[int, Tuple[int]] = 4096,
         encoder_hid_dim: int = 4096,
     ):
@@ -140,64 +141,6 @@ def __init__(
         self.conv_act_out = nn.SiLU()
         self.conv_out = nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
 
-    @property
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index 18d5eb917f48..5a93541501d3 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -23,12 +23,11 @@
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin
 from ...utils import BaseOutput, deprecate, logging
 from ...utils.torch_utils import apply_freeu
-from ..attention import BasicTransformerBlock
+from ..attention import AttentionMixin, BasicTransformerBlock
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     Attention,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
     AttnProcessor2_0,
@@ -1195,7 +1194,7 @@ def forward(self, sample):
         pass
 
 
-class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+class UNetMotionModel(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
     r"""
     A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
     sample shaped output.
@@ -1754,66 +1753,6 @@ def save_motion_modules(
             **kwargs,
         )
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
         """
         Sets the attention processor to use [feed forward
diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
index 059a6e807c8e..c0cd5fbdd489 100644
--- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py
+++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -7,7 +7,8 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import UNet2DConditionLoadersMixin
 from ...utils import BaseOutput, logging
-from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
+from ..attention import AttentionMixin
+from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttnProcessor
 from ..embeddings import TimestepEmbedding, Timesteps
 from ..modeling_utils import ModelMixin
 from .unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block, get_up_block
@@ -29,7 +30,7 @@ class UNetSpatioTemporalConditionOutput(BaseOutput):
     sample: torch.Tensor = None
 
 
-class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class UNetSpatioTemporalConditionModel(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     r"""
     A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and
     returns a sample shaped output.
@@ -73,25 +74,25 @@ def __init__(
         sample_size: Optional[int] = None,
         in_channels: int = 8,
         out_channels: int = 4,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlockSpatioTemporal",
             "CrossAttnDownBlockSpatioTemporal",
             "CrossAttnDownBlockSpatioTemporal",
             "DownBlockSpatioTemporal",
         ),
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "UpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
         ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         addition_time_embed_dim: int = 256,
         projection_class_embeddings_input_dim: int = 768,
         layers_per_block: Union[int, Tuple[int]] = 2,
         cross_attention_dim: Union[int, Tuple[int]] = 1024,
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
-        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
+        num_attention_heads: Union[int, Tuple[int, ...]] = (5, 10, 20, 20),
         num_frames: int = 25,
     ):
         super().__init__()
@@ -245,68 +246,6 @@ def __init__(
             padding=1,
         )
 
-    @property
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(
-            name: str,
-            module: torch.nn.Module,
-            processors: Dict[str, AttentionProcessor],
-        ):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py
index e79ce8ac1da4..23d358c1bf51 100644
--- a/src/diffusers/models/unets/unet_stable_cascade.py
+++ b/src/diffusers/models/unets/unet_stable_cascade.py
@@ -145,10 +145,10 @@ def __init__(
         timestep_ratio_embedding_dim: int = 64,
         patch_size: int = 1,
         conditioning_dim: int = 2048,
-        block_out_channels: Tuple[int] = (2048, 2048),
-        num_attention_heads: Tuple[int] = (32, 32),
-        down_num_layers_per_block: Tuple[int] = (8, 24),
-        up_num_layers_per_block: Tuple[int] = (24, 8),
+        block_out_channels: Tuple[int, ...] = (2048, 2048),
+        num_attention_heads: Tuple[int, ...] = (32, 32),
+        down_num_layers_per_block: Tuple[int, ...] = (8, 24),
+        up_num_layers_per_block: Tuple[int, ...] = (24, 8),
         down_blocks_repeat_mappers: Optional[Tuple[int]] = (
             1,
             1,
@@ -167,7 +167,7 @@ def __init__(
         kernel_size=3,
         dropout: Union[float, Tuple[float]] = (0.1, 0.1),
         self_attn: Union[bool, Tuple[bool]] = True,
-        timestep_conditioning_type: Tuple[str] = ("sca", "crp"),
+        timestep_conditioning_type: Tuple[str, ...] = ("sca", "crp"),
         switch_level: Optional[Tuple[bool]] = None,
     ):
         """
diff --git a/src/diffusers/models/unets/uvit_2d.py b/src/diffusers/models/unets/uvit_2d.py
index 94b39c84f055..4c99ef88ca19 100644
--- a/src/diffusers/models/unets/uvit_2d.py
+++ b/src/diffusers/models/unets/uvit_2d.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, Union
 
 import torch
 import torch.nn.functional as F
@@ -22,11 +21,10 @@
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin
-from ..attention import BasicTransformerBlock, SkipFFTransformerBlock
+from ..attention import AttentionMixin, BasicTransformerBlock, SkipFFTransformerBlock
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -36,7 +34,7 @@
 from ..resnet import Downsample2D, Upsample2D
 
 
-class UVit2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+class UVit2DModel(ModelMixin, AttentionMixin, ConfigMixin, PeftAdapterMixin):
     _supports_gradient_checkpointing = True
 
     @register_to_config
@@ -209,66 +207,6 @@ def layer_(*args):
 
         return logits
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/models/vae_flax.py b/src/diffusers/models/vae_flax.py
index 13653b90372a..5aad386a89e8 100644
--- a/src/diffusers/models/vae_flax.py
+++ b/src/diffusers/models/vae_flax.py
@@ -532,8 +532,8 @@ class FlaxEncoder(nn.Module):
 
     in_channels: int = 3
     out_channels: int = 3
-    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
-    block_out_channels: Tuple[int] = (64,)
+    down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 2
     norm_num_groups: int = 32
     act_fn: str = "silu"
@@ -650,8 +650,8 @@ class FlaxDecoder(nn.Module):
 
     in_channels: int = 3
     out_channels: int = 3
-    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
-    block_out_channels: int = (64,)
+    up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 2
     norm_num_groups: int = 32
     act_fn: str = "silu"
@@ -823,9 +823,9 @@ class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin):
 
     in_channels: int = 3
     out_channels: int = 3
-    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
-    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
-    block_out_channels: Tuple[int] = (64,)
+    down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",)
+    up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 1
     act_fn: str = "silu"
     latent_channels: int = 4
diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py
index 86ed735134ff..252b9f33dfe8 100644
--- a/src/diffusers/modular_pipelines/__init__.py
+++ b/src/diffusers/modular_pipelines/__init__.py
@@ -45,7 +45,7 @@
         "InsertableDict",
     ]
     _import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"]
-    _import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"]
+    _import_structure["wan"] = ["WanAutoBlocks", "Wan22AutoBlocks", "WanModularPipeline"]
     _import_structure["flux"] = [
         "FluxAutoBlocks",
         "FluxModularPipeline",
@@ -90,7 +90,7 @@
             QwenImageModularPipeline,
         )
         from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
-        from .wan import WanAutoBlocks, WanModularPipeline
+        from .wan import Wan22AutoBlocks, WanAutoBlocks, WanModularPipeline
 else:
     import sys
 
diff --git a/src/diffusers/modular_pipelines/components_manager.py b/src/diffusers/modular_pipelines/components_manager.py
index 9dd8035c44e7..cb7e8fb73697 100644
--- a/src/diffusers/modular_pipelines/components_manager.py
+++ b/src/diffusers/modular_pipelines/components_manager.py
@@ -164,7 +164,11 @@ def __call__(self, hooks, model_id, model, execution_device):
 
         device_type = execution_device.type
         device_module = getattr(torch, device_type, torch.cuda)
-        mem_on_device = device_module.mem_get_info(execution_device.index)[0]
+        try:
+            mem_on_device = device_module.mem_get_info(execution_device.index)[0]
+        except AttributeError:
+            raise AttributeError(f"Do not know how to obtain obtain memory info for {str(device_module)}.")
+
         mem_on_device = mem_on_device - self.memory_reserve_margin
         if current_module_size < mem_on_device:
             return []
@@ -699,6 +703,8 @@ def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = None,
         if not is_accelerate_available():
             raise ImportError("Make sure to install accelerate to use auto_cpu_offload")
 
+        # TODO: add a warning if mem_get_info isn't available on `device`.
+
         for name, component in self.components.items():
             if isinstance(component, torch.nn.Module) and hasattr(component, "_hf_hook"):
                 remove_hook_from_module(component, recurse=True)
diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py
index c098b7d4f1e5..daffec986535 100644
--- a/src/diffusers/modular_pipelines/flux/before_denoise.py
+++ b/src/diffusers/modular_pipelines/flux/before_denoise.py
@@ -598,7 +598,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             and getattr(block_state, "image_width", None) is not None
         ):
             image_latent_height = 2 * (int(block_state.image_height) // (components.vae_scale_factor * 2))
-            image_latent_width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
+            image_latent_width = 2 * (int(block_state.image_width) // (components.vae_scale_factor * 2))
             img_ids = FluxPipeline._prepare_latent_image_ids(
                 None, image_latent_height // 2, image_latent_width // 2, device, dtype
             )
diff --git a/src/diffusers/modular_pipelines/flux/denoise.py b/src/diffusers/modular_pipelines/flux/denoise.py
index b1796bb63cb0..5a769df1036d 100644
--- a/src/diffusers/modular_pipelines/flux/denoise.py
+++ b/src/diffusers/modular_pipelines/flux/denoise.py
@@ -59,7 +59,7 @@ def inputs(self) -> List[Tuple[str, Any]]:
             ),
             InputParam(
                 "guidance",
-                required=True,
+                required=False,
                 type_hint=torch.Tensor,
                 description="Guidance scale as a tensor",
             ),
@@ -141,7 +141,7 @@ def inputs(self) -> List[Tuple[str, Any]]:
             ),
             InputParam(
                 "guidance",
-                required=True,
+                required=False,
                 type_hint=torch.Tensor,
                 description="Guidance scale as a tensor",
             ),
diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py
index b71962bd9313..f0314d4771b0 100644
--- a/src/diffusers/modular_pipelines/flux/encoders.py
+++ b/src/diffusers/modular_pipelines/flux/encoders.py
@@ -95,7 +95,7 @@ def expected_components(self) -> List[ComponentSpec]:
             ComponentSpec(
                 "image_processor",
                 VaeImageProcessor,
-                config=FrozenDict({"vae_scale_factor": 16}),
+                config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}),
                 default_creation_method="from_config",
             ),
         ]
@@ -143,10 +143,6 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState):
 class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):
     model_name = "flux-kontext"
 
-    def __init__(self, _auto_resize=True):
-        self._auto_resize = _auto_resize
-        super().__init__()
-
     @property
     def description(self) -> str:
         return (
@@ -167,7 +163,7 @@ def expected_components(self) -> List[ComponentSpec]:
 
     @property
     def inputs(self) -> List[InputParam]:
-        return [InputParam("image")]
+        return [InputParam("image"), InputParam("_auto_resize", type_hint=bool, default=True)]
 
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
@@ -195,7 +191,8 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState):
             img = images[0]
             image_height, image_width = components.image_processor.get_default_height_width(img)
             aspect_ratio = image_width / image_height
-            if self._auto_resize:
+            _auto_resize = block_state._auto_resize
+            if _auto_resize:
                 # Kontext is trained on specific resolutions, using one of them is recommended
                 _, image_width, image_height = min(
                     (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py
index e1bc17f5ff4e..8309eebfeb37 100644
--- a/src/diffusers/modular_pipelines/flux/inputs.py
+++ b/src/diffusers/modular_pipelines/flux/inputs.py
@@ -112,6 +112,10 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
         block_state.prompt_embeds = block_state.prompt_embeds.view(
             block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
         )
+        pooled_prompt_embeds = block_state.pooled_prompt_embeds.repeat(1, block_state.num_images_per_prompt)
+        block_state.pooled_prompt_embeds = pooled_prompt_embeds.view(
+            block_state.batch_size * block_state.num_images_per_prompt, -1
+        )
         self.set_block_state(state, block_state)
 
         return components, state
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
index 55c261ab2f29..a6336de71a52 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -305,15 +305,15 @@ def from_pretrained(
             "cache_dir",
             "force_download",
             "local_files_only",
+            "local_dir",
             "proxies",
-            "resume_download",
             "revision",
             "subfolder",
             "token",
         ]
         hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs}
 
-        config = cls.load_config(pretrained_model_name_or_path)
+        config = cls.load_config(pretrained_model_name_or_path, **hub_kwargs)
         has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"]
         trust_remote_code = resolve_trust_remote_code(
             trust_remote_code, pretrained_model_name_or_path, has_remote_code
@@ -331,7 +331,6 @@ def from_pretrained(
             module_file=module_file,
             class_name=class_name,
             **hub_kwargs,
-            **kwargs,
         )
         expected_kwargs, optional_kwargs = block_cls._get_signature_keys(block_cls)
         block_kwargs = {
@@ -361,7 +360,7 @@ def init_pipeline(
         collection: Optional[str] = None,
     ) -> "ModularPipeline":
         """
-        create a ModularPipeline, optionally accept modular_repo to load from hub.
+        create a ModularPipeline, optionally accept pretrained_model_name_or_path to load from hub.
         """
         pipeline_class_name = MODULAR_PIPELINE_MAPPING.get(self.model_name, ModularPipeline.__name__)
         diffusers_module = importlib.import_module("diffusers")
@@ -862,6 +861,10 @@ def __init__(self):
             else:
                 sub_blocks[block_name] = block
         self.sub_blocks = sub_blocks
+        if not len(self.block_names) == len(self.block_classes):
+            raise ValueError(
+                f"In {self.__class__.__name__}, the number of block_names and block_classes must be the same."
+            )
 
     def _get_inputs(self):
         inputs = []
@@ -1442,6 +1445,8 @@ def __init__(
         pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
         components_manager: Optional[ComponentsManager] = None,
         collection: Optional[str] = None,
+        modular_config_dict: Optional[Dict[str, Any]] = None,
+        config_dict: Optional[Dict[str, Any]] = None,
         **kwargs,
     ):
         """
@@ -1493,23 +1498,8 @@ def __init__(
             - The pipeline's config dict is also used to store the pipeline blocks's class name, which will be saved as
               `_blocks_class_name` in the config dict
         """
-        if blocks is None:
-            blocks_class_name = self.default_blocks_name
-            if blocks_class_name is not None:
-                diffusers_module = importlib.import_module("diffusers")
-                blocks_class = getattr(diffusers_module, blocks_class_name)
-                blocks = blocks_class()
-            else:
-                logger.warning(f"`blocks` is `None`, no default blocks class found for {self.__class__.__name__}")
-
-        self.blocks = blocks
-        self._components_manager = components_manager
-        self._collection = collection
-        self._component_specs = {spec.name: deepcopy(spec) for spec in self.blocks.expected_components}
-        self._config_specs = {spec.name: deepcopy(spec) for spec in self.blocks.expected_configs}
 
-        # update component_specs and config_specs from modular_repo
-        if pretrained_model_name_or_path is not None:
+        if modular_config_dict is None and config_dict is None and pretrained_model_name_or_path is not None:
             cache_dir = kwargs.pop("cache_dir", None)
             force_download = kwargs.pop("force_download", False)
             proxies = kwargs.pop("proxies", None)
@@ -1525,52 +1515,59 @@ def __init__(
                 "local_files_only": local_files_only,
                 "revision": revision,
             }
-            # try to load modular_model_index.json
-            try:
-                config_dict = self.load_config(pretrained_model_name_or_path, **load_config_kwargs)
-            except EnvironmentError as e:
-                logger.debug(f"modular_model_index.json not found: {e}")
-                config_dict = None
-
-            # update component_specs and config_specs based on modular_model_index.json
-            if config_dict is not None:
-                for name, value in config_dict.items():
-                    # all the components in modular_model_index.json are from_pretrained components
-                    if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 3:
-                        library, class_name, component_spec_dict = value
-                        component_spec = self._dict_to_component_spec(name, component_spec_dict)
-                        component_spec.default_creation_method = "from_pretrained"
-                        self._component_specs[name] = component_spec
-
-                    elif name in self._config_specs:
-                        self._config_specs[name].default = value
-
-            # if modular_model_index.json is not found, try to load model_index.json
+
+            modular_config_dict, config_dict = self._load_pipeline_config(
+                pretrained_model_name_or_path, **load_config_kwargs
+            )
+
+        if blocks is None:
+            if modular_config_dict is not None:
+                blocks_class_name = modular_config_dict.get("_blocks_class_name")
+            elif config_dict is not None:
+                blocks_class_name = self.get_default_blocks_name(config_dict)
+            else:
+                blocks_class_name = None
+            if blocks_class_name is not None:
+                diffusers_module = importlib.import_module("diffusers")
+                blocks_class = getattr(diffusers_module, blocks_class_name)
+                blocks = blocks_class()
             else:
-                logger.debug(" loading config from model_index.json")
-                try:
-                    from diffusers import DiffusionPipeline
-
-                    config_dict = DiffusionPipeline.load_config(pretrained_model_name_or_path, **load_config_kwargs)
-                except EnvironmentError as e:
-                    logger.debug(f" model_index.json not found in the repo: {e}")
-                    config_dict = None
-
-                # update component_specs and config_specs based on model_index.json
-                if config_dict is not None:
-                    for name, value in config_dict.items():
-                        if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 2:
-                            library, class_name = value
-                            component_spec_dict = {
-                                "repo": pretrained_model_name_or_path,
-                                "subfolder": name,
-                                "type_hint": (library, class_name),
-                            }
-                            component_spec = self._dict_to_component_spec(name, component_spec_dict)
-                            component_spec.default_creation_method = "from_pretrained"
-                            self._component_specs[name] = component_spec
-                        elif name in self._config_specs:
-                            self._config_specs[name].default = value
+                logger.warning(f"`blocks` is `None`, no default blocks class found for {self.__class__.__name__}")
+
+        self.blocks = blocks
+        self._components_manager = components_manager
+        self._collection = collection
+        self._component_specs = {spec.name: deepcopy(spec) for spec in self.blocks.expected_components}
+        self._config_specs = {spec.name: deepcopy(spec) for spec in self.blocks.expected_configs}
+
+        # update component_specs and config_specs based on modular_model_index.json
+        if modular_config_dict is not None:
+            for name, value in modular_config_dict.items():
+                # all the components in modular_model_index.json are from_pretrained components
+                if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 3:
+                    library, class_name, component_spec_dict = value
+                    component_spec = self._dict_to_component_spec(name, component_spec_dict)
+                    component_spec.default_creation_method = "from_pretrained"
+                    self._component_specs[name] = component_spec
+
+                elif name in self._config_specs:
+                    self._config_specs[name].default = value
+
+        # if `modular_config_dict` is None (i.e. `modular_model_index.json` is not found), update based on `config_dict` (i.e. `model_index.json`)
+        elif config_dict is not None:
+            for name, value in config_dict.items():
+                if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 2:
+                    library, class_name = value
+                    component_spec_dict = {
+                        "repo": pretrained_model_name_or_path,
+                        "subfolder": name,
+                        "type_hint": (library, class_name),
+                    }
+                    component_spec = self._dict_to_component_spec(name, component_spec_dict)
+                    component_spec.default_creation_method = "from_pretrained"
+                    self._component_specs[name] = component_spec
+                elif name in self._config_specs:
+                    self._config_specs[name].default = value
 
         if len(kwargs) > 0:
             logger.warning(f"Unexpected input '{kwargs.keys()}' provided. This input will be ignored.")
@@ -1602,6 +1599,35 @@ def default_call_parameters(self) -> Dict[str, Any]:
             params[input_param.name] = input_param.default
         return params
 
+    def get_default_blocks_name(self, config_dict: Optional[Dict[str, Any]]) -> Optional[str]:
+        return self.default_blocks_name
+
+    @classmethod
+    def _load_pipeline_config(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        **load_config_kwargs,
+    ):
+        try:
+            # try to load modular_model_index.json
+            modular_config_dict = cls.load_config(pretrained_model_name_or_path, **load_config_kwargs)
+            return modular_config_dict, None
+
+        except EnvironmentError as e:
+            logger.debug(f" modular_model_index.json not found in the repo: {e}")
+
+        try:
+            logger.debug(" try to load model_index.json")
+            from diffusers import DiffusionPipeline
+
+            config_dict = DiffusionPipeline.load_config(pretrained_model_name_or_path, **load_config_kwargs)
+            return None, config_dict
+
+        except EnvironmentError as e:
+            logger.debug(f" model_index.json not found in the repo: {e}")
+
+        return None, None
+
     @classmethod
     @validate_hf_hub_args
     def from_pretrained(
@@ -1619,8 +1645,8 @@ def from_pretrained(
             pretrained_model_name_or_path (`str` or `os.PathLike`, optional):
                 Path to a pretrained pipeline configuration. It will first try to load config from
                 `modular_model_index.json`, then fallback to `model_index.json` for compatibility with standard
-                non-modular repositories. If the repo does not contain any pipeline config, it will be set to None
-                during initialization.
+                non-modular repositories. If the pretrained_model_name_or_path does not contain any pipeline config, it
+                will be set to None during initialization.
             trust_remote_code (`bool`, optional):
                 Whether to trust remote code when loading the pipeline, need to be set to True if you want to create
                 pipeline blocks based on the custom code in `pretrained_model_name_or_path`
@@ -1656,42 +1682,33 @@ def from_pretrained(
             "revision": revision,
         }
 
-        try:
-            # try to load modular_model_index.json
-            config_dict = cls.load_config(pretrained_model_name_or_path, **load_config_kwargs)
-        except EnvironmentError as e:
-            logger.debug(f" modular_model_index.json not found in the repo: {e}")
-            config_dict = None
+        modular_config_dict, config_dict = cls._load_pipeline_config(
+            pretrained_model_name_or_path, **load_config_kwargs
+        )
 
-        if config_dict is not None:
-            pipeline_class = _get_pipeline_class(cls, config=config_dict)
+        if modular_config_dict is not None:
+            pipeline_class = _get_pipeline_class(cls, config=modular_config_dict)
+        elif config_dict is not None:
+            from diffusers.pipelines.auto_pipeline import _get_model
+
+            logger.debug(" try to determine the modular pipeline class from model_index.json")
+            standard_pipeline_class = _get_pipeline_class(cls, config=config_dict)
+            model_name = _get_model(standard_pipeline_class.__name__)
+            pipeline_class_name = MODULAR_PIPELINE_MAPPING.get(model_name, ModularPipeline.__name__)
+            diffusers_module = importlib.import_module("diffusers")
+            pipeline_class = getattr(diffusers_module, pipeline_class_name)
         else:
-            try:
-                logger.debug(" try to load model_index.json")
-                from diffusers import DiffusionPipeline
-                from diffusers.pipelines.auto_pipeline import _get_model
-
-                config_dict = DiffusionPipeline.load_config(pretrained_model_name_or_path, **load_config_kwargs)
-            except EnvironmentError as e:
-                logger.debug(f" model_index.json not found in the repo: {e}")
-
-            if config_dict is not None:
-                logger.debug(" try to determine the modular pipeline class from model_index.json")
-                standard_pipeline_class = _get_pipeline_class(cls, config=config_dict)
-                model_name = _get_model(standard_pipeline_class.__name__)
-                pipeline_class_name = MODULAR_PIPELINE_MAPPING.get(model_name, ModularPipeline.__name__)
-                diffusers_module = importlib.import_module("diffusers")
-                pipeline_class = getattr(diffusers_module, pipeline_class_name)
-            else:
-                # there is no config for modular pipeline, assuming that the pipeline block does not need any from_pretrained components
-                pipeline_class = cls
-                pretrained_model_name_or_path = None
+            # there is no config for modular pipeline, assuming that the pipeline block does not need any from_pretrained components
+            pipeline_class = cls
+            pretrained_model_name_or_path = None
 
         pipeline = pipeline_class(
             blocks=blocks,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             components_manager=components_manager,
             collection=collection,
+            modular_config_dict=modular_config_dict,
+            config_dict=config_dict,
             **kwargs,
         )
         return pipeline
@@ -1790,7 +1807,7 @@ def register_components(self, **kwargs):
                 library, class_name = None, None
 
             # extract the loading spec from the updated component spec that'll be used as part of modular_model_index.json config
-            # e.g. {"repo": "stabilityai/stable-diffusion-2-1",
+            # e.g. {"pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1",
             #       "type_hint": ("diffusers", "UNet2DConditionModel"),
             #       "subfolder": "unet",
             #       "variant": None,
@@ -2094,8 +2111,10 @@ def load_components(self, names: Optional[Union[List[str], str]] = None, **kwarg
             **kwargs: additional kwargs to be passed to `from_pretrained()`.Can be:
              - a single value to be applied to all components to be loaded, e.g. torch_dtype=torch.bfloat16
              - a dict, e.g. torch_dtype={"unet": torch.bfloat16, "default": torch.float32}
-             - if potentially override ComponentSpec if passed a different loading field in kwargs, e.g. `repo`,
-               `variant`, `revision`, etc.
+             - if potentially override ComponentSpec if passed a different loading field in kwargs, e.g.
+               `pretrained_model_name_or_path`, `variant`, `revision`, etc.
+             - if potentially override ComponentSpec if passed a different loading field in kwargs, e.g.
+               `pretrained_model_name_or_path`, `variant`, `revision`, etc.
         """
 
         if names is None:
@@ -2131,8 +2150,15 @@ def load_components(self, names: Optional[Union[List[str], str]] = None, **kwarg
                         component_load_kwargs[key] = value["default"]
             try:
                 components_to_register[name] = spec.load(**component_load_kwargs)
-            except Exception as e:
-                logger.warning(f"Failed to create component '{name}': {e}")
+            except Exception:
+                logger.warning(
+                    f"\nFailed to create component {name}:\n"
+                    f"- Component spec: {spec}\n"
+                    f"- load() called with kwargs: {component_load_kwargs}\n"
+                    "If this component is not required for your workflow you can safely ignore this message.\n\n"
+                    "Traceback:\n"
+                    f"{traceback.format_exc()}"
+                )
 
         # Register all components at once
         self.register_components(**components_to_register)
@@ -2354,10 +2380,10 @@ def _component_spec_to_dict(component_spec: ComponentSpec) -> Any:
           - "type_hint": Tuple[str, str]
               Library name and class name of the component. (e.g. ("diffusers", "UNet2DConditionModel"))
           - All loading fields defined by `component_spec.loading_fields()`, typically:
-              - "repo": Optional[str]
-                  The model repository (e.g., "stabilityai/stable-diffusion-xl").
+              - "pretrained_model_name_or_path": Optional[str]
+                  The model pretrained_model_name_or_pathsitory (e.g., "stabilityai/stable-diffusion-xl").
               - "subfolder": Optional[str]
-                  A subfolder within the repo where this component lives.
+                  A subfolder within the pretrained_model_name_or_path where this component lives.
               - "variant": Optional[str]
                   An optional variant identifier for the model.
               - "revision": Optional[str]
@@ -2374,11 +2400,13 @@ def _component_spec_to_dict(component_spec: ComponentSpec) -> Any:
         Example:
             >>> from diffusers.pipelines.modular_pipeline_utils import ComponentSpec >>> from diffusers import
             UNet2DConditionModel >>> spec = ComponentSpec(
-                ... name="unet", ... type_hint=UNet2DConditionModel, ... config=None, ... repo="path/to/repo", ...
-                subfolder="subfolder", ... variant=None, ... revision=None, ...
-                default_creation_method="from_pretrained",
+                ... name="unet", ... type_hint=UNet2DConditionModel, ... config=None, ...
+                pretrained_model_name_or_path="path/to/pretrained_model_name_or_path", ... subfolder="subfolder", ...
+                variant=None, ... revision=None, ... default_creation_method="from_pretrained",
             ... ) >>> ModularPipeline._component_spec_to_dict(spec) {
-                "type_hint": ("diffusers", "UNet2DConditionModel"), "repo": "path/to/repo", "subfolder": "subfolder",
+                "type_hint": ("diffusers", "UNet2DConditionModel"), "pretrained_model_name_or_path": "path/to/repo",
+                "subfolder": "subfolder", "variant": None, "revision": None, "type_hint": ("diffusers",
+                "UNet2DConditionModel"), "pretrained_model_name_or_path": "path/to/repo", "subfolder": "subfolder",
                 "variant": None, "revision": None,
             }
         """
@@ -2408,10 +2436,10 @@ def _dict_to_component_spec(
           - "type_hint": Tuple[str, str]
               Library name and class name of the component. (e.g. ("diffusers", "UNet2DConditionModel"))
           - All loading fields defined by `component_spec.loading_fields()`, typically:
-              - "repo": Optional[str]
+              - "pretrained_model_name_or_path": Optional[str]
                   The model repository (e.g., "stabilityai/stable-diffusion-xl").
               - "subfolder": Optional[str]
-                  A subfolder within the repo where this component lives.
+                  A subfolder within the pretrained_model_name_or_path where this component lives.
               - "variant": Optional[str]
                   An optional variant identifier for the model.
               - "revision": Optional[str]
@@ -2428,11 +2456,20 @@ def _dict_to_component_spec(
             ComponentSpec: A reconstructed ComponentSpec object.
 
         Example:
-            >>> spec_dict = { ... "type_hint": ("diffusers", "UNet2DConditionModel"), ... "repo":
-            "stabilityai/stable-diffusion-xl", ... "subfolder": "unet", ... "variant": None, ... "revision": None, ...
-            } >>> ModularPipeline._dict_to_component_spec("unet", spec_dict) ComponentSpec(
-                name="unet", type_hint=UNet2DConditionModel, config=None, repo="stabilityai/stable-diffusion-xl",
-                subfolder="unet", variant=None, revision=None, default_creation_method="from_pretrained"
+            >>> spec_dict = { ... "type_hint": ("diffusers", "UNet2DConditionModel"), ...
+            "pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl", ... "subfolder": "unet", ... "variant":
+            None, ... "revision": None, ... } >>> ModularPipeline._dict_to_component_spec("unet", spec_dict)
+            ComponentSpec(
+                name="unet", type_hint=UNet2DConditionModel, config=None,
+                pretrained_model_name_or_path="stabilityai/stable-diffusion-xl", subfolder="unet", variant=None,
+                revision=None, default_creation_method="from_pretrained"
+            >>> spec_dict = { ... "type_hint": ("diffusers", "UNet2DConditionModel"), ...
+            "pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl", ... "subfolder": "unet", ... "variant":
+            None, ... "revision": None, ... } >>> ModularPipeline._dict_to_component_spec("unet", spec_dict)
+            ComponentSpec(
+                name="unet", type_hint=UNet2DConditionModel, config=None,
+                pretrained_model_name_or_path="stabilityai/stable-diffusion-xl", subfolder="unet", variant=None,
+                revision=None, default_creation_method="from_pretrained"
             )
         """
         # make a shallow copy so we can pop() safely
diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
index b15126868634..aa421a53727b 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
@@ -21,6 +21,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, FrozenDict
+from ..loaders.single_file_utils import _is_single_file_path_or_url
 from ..utils import is_torch_available, logging
 
 
@@ -80,10 +81,10 @@ class ComponentSpec:
         type_hint: Type of the component (e.g. UNet2DConditionModel)
         description: Optional description of the component
         config: Optional config dict for __init__ creation
-        repo: Optional repo path for from_pretrained creation
-        subfolder: Optional subfolder in repo
-        variant: Optional variant in repo
-        revision: Optional revision in repo
+        pretrained_model_name_or_path: Optional pretrained_model_name_or_path path for from_pretrained creation
+        subfolder: Optional subfolder in pretrained_model_name_or_path
+        variant: Optional variant in pretrained_model_name_or_path
+        revision: Optional revision in pretrained_model_name_or_path
         default_creation_method: Preferred creation method - "from_config" or "from_pretrained"
     """
 
@@ -91,13 +92,20 @@ class ComponentSpec:
     type_hint: Optional[Type] = None
     description: Optional[str] = None
     config: Optional[FrozenDict] = None
-    # YiYi Notes: should we change it to pretrained_model_name_or_path for consistency? a bit long for a field name
-    repo: Optional[Union[str, List[str]]] = field(default=None, metadata={"loading": True})
+    pretrained_model_name_or_path: Optional[Union[str, List[str]]] = field(default=None, metadata={"loading": True})
     subfolder: Optional[str] = field(default="", metadata={"loading": True})
     variant: Optional[str] = field(default=None, metadata={"loading": True})
     revision: Optional[str] = field(default=None, metadata={"loading": True})
     default_creation_method: Literal["from_config", "from_pretrained"] = "from_pretrained"
 
+    # Deprecated
+    repo: Optional[Union[str, List[str]]] = field(default=None, metadata={"loading": False})
+
+    def __post_init__(self):
+        repo_value = self.repo
+        if repo_value is not None and self.pretrained_model_name_or_path is None:
+            object.__setattr__(self, "pretrained_model_name_or_path", repo_value)
+
     def __hash__(self):
         """Make ComponentSpec hashable, using load_id as the hash value."""
         return hash((self.name, self.load_id, self.default_creation_method))
@@ -182,8 +190,8 @@ def loading_fields(cls) -> List[str]:
     @property
     def load_id(self) -> str:
         """
-        Unique identifier for this spec's pretrained load, composed of repo|subfolder|variant|revision (no empty
-        segments).
+        Unique identifier for this spec's pretrained load, composed of
+        pretrained_model_name_or_path|subfolder|variant|revision (no empty segments).
         """
         if self.default_creation_method == "from_config":
             return "null"
@@ -197,12 +205,13 @@ def decode_load_id(cls, load_id: str) -> Dict[str, Optional[str]]:
         Decode a load_id string back into a dictionary of loading fields and values.
 
         Args:
-            load_id: The load_id string to decode, format: "repo|subfolder|variant|revision"
+            load_id: The load_id string to decode, format: "pretrained_model_name_or_path|subfolder|variant|revision"
                      where None values are represented as "null"
 
         Returns:
             Dict mapping loading field names to their values. e.g. {
-                "repo": "path/to/repo", "subfolder": "subfolder", "variant": "variant", "revision": "revision"
+                "pretrained_model_name_or_path": "path/to/repo", "subfolder": "subfolder", "variant": "variant",
+                "revision": "revision"
             } If a segment value is "null", it's replaced with None. Returns None if load_id is "null" (indicating
             component not created with `load` method).
         """
@@ -259,34 +268,45 @@ def create(self, config: Optional[Union[FrozenDict, Dict[str, Any]]] = None, **k
     # YiYi TODO: add guard for type of model, if it is supported by from_pretrained
     def load(self, **kwargs) -> Any:
         """Load component using from_pretrained."""
-
-        # select loading fields from kwargs passed from user: e.g. repo, subfolder, variant, revision, note the list could change
+        # select loading fields from kwargs passed from user: e.g. pretrained_model_name_or_path, subfolder, variant, revision, note the list could change
         passed_loading_kwargs = {key: kwargs.pop(key) for key in self.loading_fields() if key in kwargs}
         # merge loading field value in the spec with user passed values to create load_kwargs
         load_kwargs = {key: passed_loading_kwargs.get(key, getattr(self, key)) for key in self.loading_fields()}
-        # repo is a required argument for from_pretrained, a.k.a. pretrained_model_name_or_path
-        repo = load_kwargs.pop("repo", None)
-        if repo is None:
+
+        pretrained_model_name_or_path = load_kwargs.pop("pretrained_model_name_or_path", None)
+        if pretrained_model_name_or_path is None:
             raise ValueError(
-                "`repo` info is required when using `load` method (you can directly set it in `repo` field of the ComponentSpec or pass it as an argument)"
+                "`pretrained_model_name_or_path` info is required when using `load` method (you can directly set it in `pretrained_model_name_or_path` field of the ComponentSpec or pass it as an argument)"
+            )
+        is_single_file = _is_single_file_path_or_url(pretrained_model_name_or_path)
+        if is_single_file and self.type_hint is None:
+            raise ValueError(
+                f"`type_hint` is required when loading a single file model but is missing for component: {self.name}"
             )
 
         if self.type_hint is None:
             try:
                 from diffusers import AutoModel
 
-                component = AutoModel.from_pretrained(repo, **load_kwargs, **kwargs)
+                component = AutoModel.from_pretrained(pretrained_model_name_or_path, **load_kwargs, **kwargs)
             except Exception as e:
                 raise ValueError(f"Unable to load {self.name} without `type_hint`: {e}")
             # update type_hint if AutoModel load successfully
             self.type_hint = component.__class__
         else:
+            # determine load method
+            load_method = (
+                getattr(self.type_hint, "from_single_file")
+                if is_single_file
+                else getattr(self.type_hint, "from_pretrained")
+            )
+
             try:
-                component = self.type_hint.from_pretrained(repo, **load_kwargs, **kwargs)
+                component = load_method(pretrained_model_name_or_path, **load_kwargs, **kwargs)
             except Exception as e:
                 raise ValueError(f"Unable to load {self.name} using load method: {e}")
 
-        self.repo = repo
+        self.pretrained_model_name_or_path = pretrained_model_name_or_path
         for k, v in load_kwargs.items():
             setattr(self, k, v)
         component._diffusers_load_id = self.load_id
diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
index fdec95dc506e..0e470332c6f4 100644
--- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
@@ -132,6 +132,7 @@ def expected_components(self) -> List[ComponentSpec]:
     @property
     def inputs(self) -> List[InputParam]:
         return [
+            InputParam("latents"),
             InputParam(name="height"),
             InputParam(name="width"),
             InputParam(name="num_images_per_prompt", default=1),
@@ -196,11 +197,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
                 f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
-
-        block_state.latents = randn_tensor(
-            shape, generator=block_state.generator, device=device, dtype=block_state.dtype
-        )
-        block_state.latents = components.pachifier.pack_latents(block_state.latents)
+        if block_state.latents is None:
+            block_state.latents = randn_tensor(
+                shape, generator=block_state.generator, device=device, dtype=block_state.dtype
+            )
+            block_state.latents = components.pachifier.pack_latents(block_state.latents)
 
         self.set_block_state(state, block_state)
         return components, state
@@ -549,8 +550,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
                     block_state.width // components.vae_scale_factor // 2,
                 )
             ]
-            * block_state.batch_size
-        ]
+        ] * block_state.batch_size
         block_state.txt_seq_lens = (
             block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None
         )
diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py
index 6c82fe989e55..26417162deee 100644
--- a/src/diffusers/modular_pipelines/qwenimage/decoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py
@@ -74,8 +74,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
         block_state = self.get_block_state(state)
 
         # YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
+        vae_scale_factor = components.vae_scale_factor
         block_state.latents = components.pachifier.unpack_latents(
-            block_state.latents, block_state.height, block_state.width
+            block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
         )
         block_state.latents = block_state.latents.to(components.vae.dtype)
 
diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py
index 04fb3fdc947b..3b56981e5290 100644
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -503,6 +503,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         block_state.prompt_embeds = block_state.prompt_embeds[:, : block_state.max_sequence_length]
         block_state.prompt_embeds_mask = block_state.prompt_embeds_mask[:, : block_state.max_sequence_length]
 
+        block_state.negative_prompt_embeds = None
+        block_state.negative_prompt_embeds_mask = None
         if components.requires_unconditional_embeds:
             negative_prompt = block_state.negative_prompt or ""
             block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds(
@@ -627,6 +629,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
             device=device,
         )
 
+        block_state.negative_prompt_embeds = None
+        block_state.negative_prompt_embeds_mask = None
         if components.requires_unconditional_embeds:
             negative_prompt = block_state.negative_prompt or " "
             block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
@@ -679,6 +683,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
             device=device,
         )
 
+        block_state.negative_prompt_embeds = None
+        block_state.negative_prompt_embeds_mask = None
         if components.requires_unconditional_embeds:
             negative_prompt = block_state.negative_prompt or " "
             block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
index 83bfcb3da4fd..419894164389 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
@@ -523,7 +523,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
         QwenImageOptionalControlNetBeforeDenoiseStep,
         QwenImageAutoDenoiseStep,
     ]
-    block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "decode"]
+    block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
 
     @property
     def description(self):
@@ -534,7 +534,6 @@ def description(self):
             + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
             + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
             + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
-            + " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n"
             + "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
             + " - for image-to-image generation, you need to provide `image_latents`\n"
             + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py
index d9e30864f660..59e1a13a5db2 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py
@@ -26,10 +26,7 @@ class QwenImagePachifier(ConfigMixin):
     config_name = "config.json"
 
     @register_to_config
-    def __init__(
-        self,
-        patch_size: int = 2,
-    ):
+    def __init__(self, patch_size: int = 2):
         super().__init__()
 
     def pack_latents(self, latents):
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
index feb78e1ef11b..6e0307260d1d 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
@@ -21,8 +21,7 @@
 from ...configuration_utils import FrozenDict
 from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKL
-from ...models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
-from ...utils import logging
+from ...utils import deprecate, logging
 from ..modular_pipeline import (
     ModularPipelineBlocks,
     PipelineState,
@@ -77,21 +76,12 @@ def intermediate_outputs(self) -> List[str]:
     @staticmethod
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae with self->components
     def upcast_vae(components):
-        dtype = components.vae.dtype
-        components.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            components.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            components.vae.post_quant_conv.to(dtype)
-            components.vae.decoder.conv_in.to(dtype)
-            components.vae.decoder.mid_block.to(dtype)
+        components.vae.to(dtype=torch.float32)
 
     @torch.no_grad()
     def __call__(self, components, state: PipelineState) -> PipelineState:
diff --git a/src/diffusers/modular_pipelines/wan/__init__.py b/src/diffusers/modular_pipelines/wan/__init__.py
index 7b548e003c63..73f67c9afed2 100644
--- a/src/diffusers/modular_pipelines/wan/__init__.py
+++ b/src/diffusers/modular_pipelines/wan/__init__.py
@@ -21,16 +21,14 @@
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
+    _import_structure["decoders"] = ["WanImageVaeDecoderStep"]
     _import_structure["encoders"] = ["WanTextEncoderStep"]
     _import_structure["modular_blocks"] = [
         "ALL_BLOCKS",
-        "AUTO_BLOCKS",
-        "TEXT2VIDEO_BLOCKS",
-        "WanAutoBeforeDenoiseStep",
+        "Wan22AutoBlocks",
         "WanAutoBlocks",
-        "WanAutoBlocks",
-        "WanAutoDecodeStep",
-        "WanAutoDenoiseStep",
+        "WanAutoImageEncoderStep",
+        "WanAutoVaeImageEncoderStep",
     ]
     _import_structure["modular_pipeline"] = ["WanModularPipeline"]
 
@@ -41,15 +39,14 @@
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
+        from .decoders import WanImageVaeDecoderStep
         from .encoders import WanTextEncoderStep
         from .modular_blocks import (
             ALL_BLOCKS,
-            AUTO_BLOCKS,
-            TEXT2VIDEO_BLOCKS,
-            WanAutoBeforeDenoiseStep,
+            Wan22AutoBlocks,
             WanAutoBlocks,
-            WanAutoDecodeStep,
-            WanAutoDenoiseStep,
+            WanAutoImageEncoderStep,
+            WanAutoVaeImageEncoderStep,
         )
         from .modular_pipeline import WanModularPipeline
 else:
diff --git a/src/diffusers/modular_pipelines/wan/before_denoise.py b/src/diffusers/modular_pipelines/wan/before_denoise.py
index d48f678edd59..e2f8d3e7d88b 100644
--- a/src/diffusers/modular_pipelines/wan/before_denoise.py
+++ b/src/diffusers/modular_pipelines/wan/before_denoise.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 
 import inspect
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 import torch
 
+from ...models import WanTransformer3DModel
 from ...schedulers import UniPCMultistepScheduler
 from ...utils import logging
 from ...utils.torch_utils import randn_tensor
@@ -34,6 +35,97 @@
 # configuration of guider is.
 
 
+def repeat_tensor_to_batch_size(
+    input_name: str,
+    input_tensor: torch.Tensor,
+    batch_size: int,
+    num_videos_per_prompt: int = 1,
+) -> torch.Tensor:
+    """Repeat tensor elements to match the final batch size.
+
+    This function expands a tensor's batch dimension to match the final batch size (batch_size * num_videos_per_prompt)
+    by repeating each element along dimension 0.
+
+    The input tensor must have batch size 1 or batch_size. The function will:
+    - If batch size is 1: repeat each element (batch_size * num_videos_per_prompt) times
+    - If batch size equals batch_size: repeat each element num_videos_per_prompt times
+
+    Args:
+        input_name (str): Name of the input tensor (used for error messages)
+        input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
+        batch_size (int): The base batch size (number of prompts)
+        num_videos_per_prompt (int, optional): Number of videos to generate per prompt. Defaults to 1.
+
+    Returns:
+        torch.Tensor: The repeated tensor with final batch size (batch_size * num_videos_per_prompt)
+
+    Raises:
+        ValueError: If input_tensor is not a torch.Tensor or has invalid batch size
+
+    Examples:
+        tensor = torch.tensor([[1, 2, 3]]) # shape: [1, 3] repeated = repeat_tensor_to_batch_size("image", tensor,
+        batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape:
+        [4, 3]
+
+        tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) # shape: [2, 3] repeated = repeat_tensor_to_batch_size("image",
+        tensor, batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]])
+        - shape: [4, 3]
+    """
+    # make sure input is a tensor
+    if not isinstance(input_tensor, torch.Tensor):
+        raise ValueError(f"`{input_name}` must be a tensor")
+
+    # make sure input tensor e.g. image_latents has batch size 1 or batch_size same as prompts
+    if input_tensor.shape[0] == 1:
+        repeat_by = batch_size * num_videos_per_prompt
+    elif input_tensor.shape[0] == batch_size:
+        repeat_by = num_videos_per_prompt
+    else:
+        raise ValueError(
+            f"`{input_name}` must have have batch size 1 or {batch_size}, but got {input_tensor.shape[0]}"
+        )
+
+    # expand the tensor to match the batch_size * num_videos_per_prompt
+    input_tensor = input_tensor.repeat_interleave(repeat_by, dim=0)
+
+    return input_tensor
+
+
+def calculate_dimension_from_latents(
+    latents: torch.Tensor, vae_scale_factor_temporal: int, vae_scale_factor_spatial: int
+) -> Tuple[int, int]:
+    """Calculate image dimensions from latent tensor dimensions.
+
+    This function converts latent temporal and spatial dimensions to image temporal and spatial dimensions by
+    multiplying the latent num_frames/height/width by the VAE scale factor.
+
+    Args:
+        latents (torch.Tensor): The latent tensor. Must have 4 or 5 dimensions.
+            Expected shapes: [batch, channels, height, width] or [batch, channels, frames, height, width]
+        vae_scale_factor_temporal (int): The scale factor used by the VAE to compress temporal dimension.
+            Typically 4 for most VAEs (video is 4x larger than latents in temporal dimension)
+        vae_scale_factor_spatial (int): The scale factor used by the VAE to compress spatial dimension.
+            Typically 8 for most VAEs (image is 8x larger than latents in each dimension)
+
+    Returns:
+        Tuple[int, int]: The calculated image dimensions as (height, width)
+
+    Raises:
+        ValueError: If latents tensor doesn't have 4 or 5 dimensions
+
+    """
+    if latents.ndim != 5:
+        raise ValueError(f"latents must have 5 dimensions, but got {latents.ndim}")
+
+    _, _, num_latent_frames, latent_height, latent_width = latents.shape
+
+    num_frames = (num_latent_frames - 1) * vae_scale_factor_temporal + 1
+    height = latent_height * vae_scale_factor_spatial
+    width = latent_width * vae_scale_factor_spatial
+
+    return num_frames, height, width
+
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
@@ -94,7 +186,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class WanInputStep(ModularPipelineBlocks):
+class WanTextInputStep(ModularPipelineBlocks):
     model_name = "wan"
 
     @property
@@ -109,14 +201,15 @@ def description(self) -> str:
         )
 
     @property
-    def inputs(self) -> List[InputParam]:
+    def expected_components(self) -> List[ComponentSpec]:
         return [
-            InputParam("num_videos_per_prompt", default=1),
+            ComponentSpec("transformer", WanTransformer3DModel),
         ]
 
     @property
-    def intermediate_inputs(self) -> List[str]:
+    def inputs(self) -> List[InputParam]:
         return [
+            InputParam("num_videos_per_prompt", default=1),
             InputParam(
                 "prompt_embeds",
                 required=True,
@@ -141,19 +234,7 @@ def intermediate_outputs(self) -> List[str]:
             OutputParam(
                 "dtype",
                 type_hint=torch.dtype,
-                description="Data type of model tensor inputs (determined by `prompt_embeds`)",
-            ),
-            OutputParam(
-                "prompt_embeds",
-                type_hint=torch.Tensor,
-                kwargs_type="denoiser_input_fields",  # already in intermedites state but declare here again for denoiser_input_fields
-                description="text embeddings used to guide the image generation",
-            ),
-            OutputParam(
-                "negative_prompt_embeds",
-                type_hint=torch.Tensor,
-                kwargs_type="denoiser_input_fields",  # already in intermedites state but declare here again for denoiser_input_fields
-                description="negative text embeddings used to guide the image generation",
+                description="Data type of model tensor inputs (determined by `transformer.dtype`)",
             ),
         ]
 
@@ -194,6 +275,140 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
         return components, state
 
 
+class WanAdditionalInputsStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    def __init__(
+        self,
+        image_latent_inputs: List[str] = ["first_frame_latents"],
+        additional_batch_inputs: List[str] = [],
+    ):
+        """Initialize a configurable step that standardizes the inputs for the denoising step. It:\n"
+
+        This step handles multiple common tasks to prepare inputs for the denoising step:
+        1. For encoded image latents, use it update height/width if None, and expands batch size
+        2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
+
+        This is a dynamic block that allows you to configure which inputs to process.
+
+        Args:
+            image_latent_inputs (List[str], optional): Names of image latent tensors to process.
+                In additional to adjust batch size of these inputs, they will be used to determine height/width. Can be
+                a single string or list of strings. Defaults to ["first_frame_latents"].
+            additional_batch_inputs (List[str], optional):
+                Names of additional conditional input tensors to expand batch size. These tensors will only have their
+                batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
+                Defaults to [].
+
+        Examples:
+            # Configure to process first_frame_latents (default behavior) WanAdditionalInputsStep()
+
+            # Configure to process multiple image latent inputs
+            WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents", "last_frame_latents"])
+
+            # Configure to process image latents and additional batch inputs WanAdditionalInputsStep(
+                image_latent_inputs=["first_frame_latents"], additional_batch_inputs=["image_embeds"]
+            )
+        """
+        if not isinstance(image_latent_inputs, list):
+            image_latent_inputs = [image_latent_inputs]
+        if not isinstance(additional_batch_inputs, list):
+            additional_batch_inputs = [additional_batch_inputs]
+
+        self._image_latent_inputs = image_latent_inputs
+        self._additional_batch_inputs = additional_batch_inputs
+        super().__init__()
+
+    @property
+    def description(self) -> str:
+        # Functionality section
+        summary_section = (
+            "Input processing step that:\n"
+            "  1. For image latent inputs: Updates height/width if None, and expands batch size\n"
+            "  2. For additional batch inputs: Expands batch dimensions to match final batch size"
+        )
+
+        # Inputs info
+        inputs_info = ""
+        if self._image_latent_inputs or self._additional_batch_inputs:
+            inputs_info = "\n\nConfigured inputs:"
+            if self._image_latent_inputs:
+                inputs_info += f"\n  - Image latent inputs: {self._image_latent_inputs}"
+            if self._additional_batch_inputs:
+                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"
+
+        # Placement guidance
+        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
+
+        return summary_section + inputs_info + placement_section
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        inputs = [
+            InputParam(name="num_videos_per_prompt", default=1),
+            InputParam(name="batch_size", required=True),
+            InputParam(name="height"),
+            InputParam(name="width"),
+            InputParam(name="num_frames"),
+        ]
+
+        # Add image latent inputs
+        for image_latent_input_name in self._image_latent_inputs:
+            inputs.append(InputParam(name=image_latent_input_name))
+
+        # Add additional batch inputs
+        for input_name in self._additional_batch_inputs:
+            inputs.append(InputParam(name=input_name))
+
+        return inputs
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        # Process image latent inputs (height/width calculation, patchify, and batch expansion)
+        for image_latent_input_name in self._image_latent_inputs:
+            image_latent_tensor = getattr(block_state, image_latent_input_name)
+            if image_latent_tensor is None:
+                continue
+
+            # 1. Calculate num_frames, height/width from latents
+            num_frames, height, width = calculate_dimension_from_latents(
+                image_latent_tensor, components.vae_scale_factor_temporal, components.vae_scale_factor_spatial
+            )
+            block_state.num_frames = block_state.num_frames or num_frames
+            block_state.height = block_state.height or height
+            block_state.width = block_state.width or width
+
+            # 3. Expand batch size
+            image_latent_tensor = repeat_tensor_to_batch_size(
+                input_name=image_latent_input_name,
+                input_tensor=image_latent_tensor,
+                num_videos_per_prompt=block_state.num_videos_per_prompt,
+                batch_size=block_state.batch_size,
+            )
+
+            setattr(block_state, image_latent_input_name, image_latent_tensor)
+
+        # Process additional batch inputs (only batch expansion)
+        for input_name in self._additional_batch_inputs:
+            input_tensor = getattr(block_state, input_name)
+            if input_tensor is None:
+                continue
+
+            # Only expand batch size
+            input_tensor = repeat_tensor_to_batch_size(
+                input_name=input_name,
+                input_tensor=input_tensor,
+                num_videos_per_prompt=block_state.num_videos_per_prompt,
+                batch_size=block_state.batch_size,
+            )
+
+            setattr(block_state, input_name, input_tensor)
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
 class WanSetTimestepsStep(ModularPipelineBlocks):
     model_name = "wan"
 
@@ -215,26 +430,15 @@ def inputs(self) -> List[InputParam]:
             InputParam("sigmas"),
         ]
 
-    @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [
-            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference"),
-            OutputParam(
-                "num_inference_steps",
-                type_hint=int,
-                description="The number of denoising steps to perform at inference time",
-            ),
-        ]
-
     @torch.no_grad()
     def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
-        block_state.device = components._execution_device
+        device = components._execution_device
 
         block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
             components.scheduler,
             block_state.num_inference_steps,
-            block_state.device,
+            device,
             block_state.timesteps,
             block_state.sigmas,
         )
@@ -246,10 +450,6 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
 class WanPrepareLatentsStep(ModularPipelineBlocks):
     model_name = "wan"
 
-    @property
-    def expected_components(self) -> List[ComponentSpec]:
-        return []
-
     @property
     def description(self) -> str:
         return "Prepare latents step that prepares the latents for the text-to-video generation process"
@@ -262,11 +462,6 @@ def inputs(self) -> List[InputParam]:
             InputParam("num_frames", type_hint=int),
             InputParam("latents", type_hint=Optional[torch.Tensor]),
             InputParam("num_videos_per_prompt", type_hint=int, default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[InputParam]:
-        return [
             InputParam("generator"),
             InputParam(
                 "batch_size",
@@ -337,29 +532,106 @@ def prepare_latents(
     @torch.no_grad()
     def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        device = components._execution_device
+        dtype = torch.float32  # Wan latents should be torch.float32 for best quality
 
         block_state.height = block_state.height or components.default_height
         block_state.width = block_state.width or components.default_width
         block_state.num_frames = block_state.num_frames or components.default_num_frames
-        block_state.device = components._execution_device
-        block_state.dtype = torch.float32  # Wan latents should be torch.float32 for best quality
-        block_state.num_channels_latents = components.num_channels_latents
-
-        self.check_inputs(components, block_state)
 
         block_state.latents = self.prepare_latents(
             components,
-            block_state.batch_size * block_state.num_videos_per_prompt,
-            block_state.num_channels_latents,
-            block_state.height,
-            block_state.width,
-            block_state.num_frames,
-            block_state.dtype,
-            block_state.device,
-            block_state.generator,
-            block_state.latents,
+            batch_size=block_state.batch_size * block_state.num_videos_per_prompt,
+            num_channels_latents=components.num_channels_latents,
+            height=block_state.height,
+            width=block_state.width,
+            num_frames=block_state.num_frames,
+            dtype=dtype,
+            device=device,
+            generator=block_state.generator,
+            latents=block_state.latents,
         )
 
         self.set_block_state(state, block_state)
 
         return components, state
+
+
+class WanPrepareFirstFrameLatentsStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "step that prepares the masked first frame latents and add it to the latent condition"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("first_frame_latents", type_hint=Optional[torch.Tensor]),
+            InputParam("num_frames", type_hint=int),
+        ]
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        batch_size, _, _, latent_height, latent_width = block_state.first_frame_latents.shape
+
+        mask_lat_size = torch.ones(batch_size, 1, block_state.num_frames, latent_height, latent_width)
+        mask_lat_size[:, :, list(range(1, block_state.num_frames))] = 0
+
+        first_frame_mask = mask_lat_size[:, :, 0:1]
+        first_frame_mask = torch.repeat_interleave(
+            first_frame_mask, dim=2, repeats=components.vae_scale_factor_temporal
+        )
+        mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
+        mask_lat_size = mask_lat_size.view(
+            batch_size, -1, components.vae_scale_factor_temporal, latent_height, latent_width
+        )
+        mask_lat_size = mask_lat_size.transpose(1, 2)
+        mask_lat_size = mask_lat_size.to(block_state.first_frame_latents.device)
+        block_state.first_frame_latents = torch.concat([mask_lat_size, block_state.first_frame_latents], dim=1)
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class WanPrepareFirstLastFrameLatentsStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "step that prepares the masked latents with first and last frames and add it to the latent condition"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("first_last_frame_latents", type_hint=Optional[torch.Tensor]),
+            InputParam("num_frames", type_hint=int),
+        ]
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        batch_size, _, _, latent_height, latent_width = block_state.first_last_frame_latents.shape
+
+        mask_lat_size = torch.ones(batch_size, 1, block_state.num_frames, latent_height, latent_width)
+        mask_lat_size[:, :, list(range(1, block_state.num_frames - 1))] = 0
+
+        first_frame_mask = mask_lat_size[:, :, 0:1]
+        first_frame_mask = torch.repeat_interleave(
+            first_frame_mask, dim=2, repeats=components.vae_scale_factor_temporal
+        )
+        mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
+        mask_lat_size = mask_lat_size.view(
+            batch_size, -1, components.vae_scale_factor_temporal, latent_height, latent_width
+        )
+        mask_lat_size = mask_lat_size.transpose(1, 2)
+        mask_lat_size = mask_lat_size.to(block_state.first_last_frame_latents.device)
+        block_state.first_last_frame_latents = torch.concat(
+            [mask_lat_size, block_state.first_last_frame_latents], dim=1
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/wan/decoders.py b/src/diffusers/modular_pipelines/wan/decoders.py
index 8c751172d858..7cec318c1706 100644
--- a/src/diffusers/modular_pipelines/wan/decoders.py
+++ b/src/diffusers/modular_pipelines/wan/decoders.py
@@ -29,7 +29,7 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class WanDecodeStep(ModularPipelineBlocks):
+class WanImageVaeDecoderStep(ModularPipelineBlocks):
     model_name = "wan"
 
     @property
@@ -50,12 +50,6 @@ def description(self) -> str:
 
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
-        return [
-            InputParam("output_type", default="pil"),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
         return [
             InputParam(
                 "latents",
@@ -80,25 +74,20 @@ def __call__(self, components, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
         vae_dtype = components.vae.dtype
 
-        if not block_state.output_type == "latent":
-            latents = block_state.latents
-            latents_mean = (
-                torch.tensor(components.vae.config.latents_mean)
-                .view(1, components.vae.config.z_dim, 1, 1, 1)
-                .to(latents.device, latents.dtype)
-            )
-            latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
-                1, components.vae.config.z_dim, 1, 1, 1
-            ).to(latents.device, latents.dtype)
-            latents = latents / latents_std + latents_mean
-            latents = latents.to(vae_dtype)
-            block_state.videos = components.vae.decode(latents, return_dict=False)[0]
-        else:
-            block_state.videos = block_state.latents
-
-        block_state.videos = components.video_processor.postprocess_video(
-            block_state.videos, output_type=block_state.output_type
+        latents = block_state.latents
+        latents_mean = (
+            torch.tensor(components.vae.config.latents_mean)
+            .view(1, components.vae.config.z_dim, 1, 1, 1)
+            .to(latents.device, latents.dtype)
         )
+        latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
+            1, components.vae.config.z_dim, 1, 1, 1
+        ).to(latents.device, latents.dtype)
+        latents = latents / latents_std + latents_mean
+        latents = latents.to(vae_dtype)
+        block_state.videos = components.vae.decode(latents, return_dict=False)[0]
+
+        block_state.videos = components.video_processor.postprocess_video(block_state.videos, output_type="np")
 
         self.set_block_state(state, block_state)
 
diff --git a/src/diffusers/modular_pipelines/wan/denoise.py b/src/diffusers/modular_pipelines/wan/denoise.py
index 4f3ca80acc70..2da36f52da87 100644
--- a/src/diffusers/modular_pipelines/wan/denoise.py
+++ b/src/diffusers/modular_pipelines/wan/denoise.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, List, Tuple
+from typing import Any, Dict, List, Tuple
 
 import torch
 
@@ -27,44 +27,102 @@
     ModularPipelineBlocks,
     PipelineState,
 )
-from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam
 from .modular_pipeline import WanModularPipeline
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class WanLoopDenoiser(ModularPipelineBlocks):
+class WanLoopBeforeDenoiser(ModularPipelineBlocks):
     model_name = "wan"
 
     @property
-    def expected_components(self) -> List[ComponentSpec]:
+    def description(self) -> str:
+        return (
+            "step within the denoising loop that prepares the latent input for the denoiser. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `WanDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
         return [
-            ComponentSpec(
-                "guider",
-                ClassifierFreeGuidance,
-                config=FrozenDict({"guidance_scale": 5.0}),
-                default_creation_method="from_config",
+            InputParam(
+                "latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
+            ),
+            InputParam(
+                "dtype",
+                required=True,
+                type_hint=torch.dtype,
+                description="The dtype of the model inputs. Can be generated in input step.",
             ),
-            ComponentSpec("transformer", WanTransformer3DModel),
         ]
 
+    @torch.no_grad()
+    def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        block_state.latent_model_input = block_state.latents.to(block_state.dtype)
+        return components, block_state
+
+
+class WanImage2VideoLoopBeforeDenoiser(ModularPipelineBlocks):
+    model_name = "wan"
+
     @property
     def description(self) -> str:
         return (
-            "Step within the denoising loop that denoise the latents with guidance. "
+            "step within the denoising loop that prepares the latent input for the denoiser. "
             "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
             "object (e.g. `WanDenoiseLoopWrapper`)"
         )
 
     @property
-    def inputs(self) -> List[Tuple[str, Any]]:
+    def inputs(self) -> List[InputParam]:
         return [
-            InputParam("attention_kwargs"),
+            InputParam(
+                "latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
+            ),
+            InputParam(
+                "first_frame_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The first frame latents to use for the denoising process. Can be generated in prepare_first_frame_latents step.",
+            ),
+            InputParam(
+                "dtype",
+                required=True,
+                type_hint=torch.dtype,
+                description="The dtype of the model inputs. Can be generated in input step.",
+            ),
         ]
 
+    @torch.no_grad()
+    def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        block_state.latent_model_input = torch.cat([block_state.latents, block_state.first_frame_latents], dim=1).to(
+            block_state.dtype
+        )
+        return components, block_state
+
+
+class WanFLF2VLoopBeforeDenoiser(ModularPipelineBlocks):
+    model_name = "wan"
+
     @property
-    def intermediate_inputs(self) -> List[str]:
+    def description(self) -> str:
+        return (
+            "step within the denoising loop that prepares the latent input for the denoiser. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `WanDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
         return [
             InputParam(
                 "latents",
@@ -72,36 +130,99 @@ def intermediate_inputs(self) -> List[str]:
                 type_hint=torch.Tensor,
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
             ),
+            InputParam(
+                "first_last_frame_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The first and last frame latents to use for the denoising process. Can be generated in prepare_first_last_frame_latents step.",
+            ),
+            InputParam(
+                "dtype",
+                required=True,
+                type_hint=torch.dtype,
+                description="The dtype of the model inputs. Can be generated in input step.",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        block_state.latent_model_input = torch.cat(
+            [block_state.latents, block_state.first_last_frame_latents], dim=1
+        ).to(block_state.dtype)
+        return components, block_state
+
+
+class WanLoopDenoiser(ModularPipelineBlocks):
+    model_name = "wan"
+
+    def __init__(
+        self,
+        guider_input_fields: Dict[str, Any] = {"encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds")},
+    ):
+        """Initialize a denoiser block that calls the denoiser model. This block is used in Wan2.1.
+
+        Args:
+            guider_input_fields: A dictionary that maps each argument expected by the denoiser model
+                (for example, "encoder_hidden_states") to data stored on 'block_state'. The value can be either:
+
+                - A tuple of strings. For instance, {"encoder_hidden_states": ("prompt_embeds",
+                  "negative_prompt_embeds")} tells the guider to read `block_state.prompt_embeds` and
+                  `block_state.negative_prompt_embeds` and pass them as the conditional and unconditional batches of
+                  'encoder_hidden_states'.
+                - A string. For example, {"encoder_hidden_image": "image_embeds"} makes the guider forward
+                  `block_state.image_embeds` for both conditional and unconditional batches.
+        """
+        if not isinstance(guider_input_fields, dict):
+            raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}")
+        self._guider_input_fields = guider_input_fields
+        super().__init__()
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 5.0}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("transformer", WanTransformer3DModel),
+        ]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that denoise the latents with guidance. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `WanDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[Tuple[str, Any]]:
+        inputs = [
+            InputParam("attention_kwargs"),
             InputParam(
                 "num_inference_steps",
                 required=True,
                 type_hint=int,
                 description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
-            InputParam(
-                kwargs_type="denoiser_input_fields",
-                description=(
-                    "All conditional model inputs that need to be prepared with guider. "
-                    "It should contain prompt_embeds/negative_prompt_embeds. "
-                    "Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
-                ),
-            ),
         ]
+        guider_input_names = []
+        for value in self._guider_input_fields.values():
+            if isinstance(value, tuple):
+                guider_input_names.extend(value)
+            else:
+                guider_input_names.append(value)
+
+        for name in guider_input_names:
+            inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor))
+        return inputs
 
     @torch.no_grad()
     def __call__(
         self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
     ) -> PipelineState:
-        #  Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
-        #  to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
-        guider_inputs = {
-            "prompt_embeds": (
-                getattr(block_state, "prompt_embeds", None),
-                getattr(block_state, "negative_prompt_embeds", None),
-            ),
-        }
-        transformer_dtype = components.transformer.dtype
-
         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
 
         # The guider splits model inputs into separate batches for conditional/unconditional predictions.
@@ -112,22 +233,26 @@ def __call__(
         #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
         #   ]
         # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
-        guider_state = components.guider.prepare_inputs(guider_inputs)
+        guider_state = components.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields)
 
         # run the denoiser for each guidance batch
         for guider_state_batch in guider_state:
             components.guider.prepare_models(components.transformer)
-            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()}
-            prompt_embeds = cond_kwargs.pop("prompt_embeds")
+            cond_kwargs = guider_state_batch.as_dict()
+            cond_kwargs = {
+                k: v.to(block_state.dtype) if isinstance(v, torch.Tensor) else v
+                for k, v in cond_kwargs.items()
+                if k in self._guider_input_fields.keys()
+            }
 
             # Predict the noise residual
             # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
             guider_state_batch.noise_pred = components.transformer(
-                hidden_states=block_state.latents.to(transformer_dtype),
-                timestep=t.flatten(),
-                encoder_hidden_states=prompt_embeds,
+                hidden_states=block_state.latent_model_input.to(block_state.dtype),
+                timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype),
                 attention_kwargs=block_state.attention_kwargs,
                 return_dict=False,
+                **cond_kwargs,
             )[0]
             components.guider.cleanup_models(components.transformer)
 
@@ -137,36 +262,157 @@ def __call__(
         return components, block_state
 
 
-class WanLoopAfterDenoiser(ModularPipelineBlocks):
+class Wan22LoopDenoiser(ModularPipelineBlocks):
     model_name = "wan"
 
+    def __init__(
+        self,
+        guider_input_fields: Dict[str, Any] = {"encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds")},
+    ):
+        """Initialize a denoiser block that calls the denoiser model. This block is used in Wan2.2.
+
+        Args:
+            guider_input_fields: A dictionary that maps each argument expected by the denoiser model
+                (for example, "encoder_hidden_states") to data stored on `block_state`. The value can be either:
+
+                - A tuple of strings. For instance, `{"encoder_hidden_states": ("prompt_embeds",
+                  "negative_prompt_embeds")}` tells the guider to read `block_state.prompt_embeds` and
+                  `block_state.negative_prompt_embeds` and pass them as the conditional and unconditional batches of
+                  `encoder_hidden_states`.
+                - A string. For example, `{"encoder_hidden_image": "image_embeds"}` makes the guider forward
+                  `block_state.image_embeds` for both conditional and unconditional batches.
+        """
+        if not isinstance(guider_input_fields, dict):
+            raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}")
+        self._guider_input_fields = guider_input_fields
+        super().__init__()
+
     @property
     def expected_components(self) -> List[ComponentSpec]:
         return [
-            ComponentSpec("scheduler", UniPCMultistepScheduler),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec(
+                "guider_2",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 3.0}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("transformer", WanTransformer3DModel),
+            ComponentSpec("transformer_2", WanTransformer3DModel),
         ]
 
     @property
     def description(self) -> str:
         return (
-            "step within the denoising loop that update the latents. "
+            "Step within the denoising loop that denoise the latents with guidance. "
             "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
             "object (e.g. `WanDenoiseLoopWrapper`)"
         )
 
+    @property
+    def expected_configs(self) -> List[ConfigSpec]:
+        return [
+            ConfigSpec(
+                name="boundary_ratio",
+                default=0.875,
+                description="The boundary ratio to divide the denoising loop into high noise and low noise stages.",
+            ),
+        ]
+
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
-        return []
+        inputs = [
+            InputParam("attention_kwargs"),
+            InputParam(
+                "num_inference_steps",
+                required=True,
+                type_hint=int,
+                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
+            ),
+        ]
+        guider_input_names = []
+        for value in self._guider_input_fields.values():
+            if isinstance(value, tuple):
+                guider_input_names.extend(value)
+            else:
+                guider_input_names.append(value)
+
+        for name in guider_input_names:
+            inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor))
+        return inputs
+
+    @torch.no_grad()
+    def __call__(
+        self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
+    ) -> PipelineState:
+        boundary_timestep = components.config.boundary_ratio * components.num_train_timesteps
+        if t >= boundary_timestep:
+            block_state.current_model = components.transformer
+            block_state.guider = components.guider
+        else:
+            block_state.current_model = components.transformer_2
+            block_state.guider = components.guider_2
+
+        block_state.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
+
+        # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+        # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+        # you will get a guider_state with two batches:
+        #   guider_state = [
+        #       {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},      # conditional batch
+        #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+        #   ]
+        # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+        guider_state = block_state.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields)
+
+        # run the denoiser for each guidance batch
+        for guider_state_batch in guider_state:
+            block_state.guider.prepare_models(block_state.current_model)
+            cond_kwargs = guider_state_batch.as_dict()
+            cond_kwargs = {
+                k: v.to(block_state.dtype) if isinstance(v, torch.Tensor) else v
+                for k, v in cond_kwargs.items()
+                if k in self._guider_input_fields.keys()
+            }
+
+            # Predict the noise residual
+            # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
+            guider_state_batch.noise_pred = block_state.current_model(
+                hidden_states=block_state.latent_model_input.to(block_state.dtype),
+                timestep=t.expand(block_state.latent_model_input.shape[0]).to(block_state.dtype),
+                attention_kwargs=block_state.attention_kwargs,
+                return_dict=False,
+                **cond_kwargs,
+            )[0]
+            block_state.guider.cleanup_models(block_state.current_model)
+
+        # Perform guidance
+        block_state.noise_pred = block_state.guider(guider_state)[0]
+
+        return components, block_state
+
+
+class WanLoopAfterDenoiser(ModularPipelineBlocks):
+    model_name = "wan"
 
     @property
-    def intermediate_inputs(self) -> List[str]:
+    def expected_components(self) -> List[ComponentSpec]:
         return [
-            InputParam("generator"),
+            ComponentSpec("scheduler", UniPCMultistepScheduler),
         ]
 
     @property
-    def intermediate_outputs(self) -> List[OutputParam]:
-        return [OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents")]
+    def description(self) -> str:
+        return (
+            "step within the denoising loop that update the latents. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `WanDenoiseLoopWrapper`)"
+        )
 
     @torch.no_grad()
     def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
@@ -198,18 +444,11 @@ def description(self) -> str:
     @property
     def loop_expected_components(self) -> List[ComponentSpec]:
         return [
-            ComponentSpec(
-                "guider",
-                ClassifierFreeGuidance,
-                config=FrozenDict({"guidance_scale": 5.0}),
-                default_creation_method="from_config",
-            ),
             ComponentSpec("scheduler", UniPCMultistepScheduler),
-            ComponentSpec("transformer", WanTransformer3DModel),
         ]
 
     @property
-    def loop_intermediate_inputs(self) -> List[InputParam]:
+    def loop_inputs(self) -> List[InputParam]:
         return [
             InputParam(
                 "timesteps",
@@ -248,7 +487,114 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
 
 class WanDenoiseStep(WanDenoiseLoopWrapper):
     block_classes = [
-        WanLoopDenoiser,
+        WanLoopBeforeDenoiser,
+        WanLoopDenoiser(
+            guider_input_fields={
+                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+            }
+        ),
+        WanLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. \n"
+            "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `WanLoopBeforeDenoiser`\n"
+            " - `WanLoopDenoiser`\n"
+            " - `WanLoopAfterDenoiser`\n"
+            "This block supports text-to-video tasks for wan2.1."
+        )
+
+
+class Wan22DenoiseStep(WanDenoiseLoopWrapper):
+    block_classes = [
+        WanLoopBeforeDenoiser,
+        Wan22LoopDenoiser(
+            guider_input_fields={
+                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+            }
+        ),
+        WanLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. \n"
+            "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `WanLoopBeforeDenoiser`\n"
+            " - `Wan22LoopDenoiser`\n"
+            " - `WanLoopAfterDenoiser`\n"
+            "This block supports text-to-video tasks for Wan2.2."
+        )
+
+
+class WanImage2VideoDenoiseStep(WanDenoiseLoopWrapper):
+    block_classes = [
+        WanImage2VideoLoopBeforeDenoiser,
+        WanLoopDenoiser(
+            guider_input_fields={
+                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+                "encoder_hidden_states_image": "image_embeds",
+            }
+        ),
+        WanLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. \n"
+            "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `WanImage2VideoLoopBeforeDenoiser`\n"
+            " - `WanLoopDenoiser`\n"
+            " - `WanLoopAfterDenoiser`\n"
+            "This block supports image-to-video tasks for wan2.1."
+        )
+
+
+class Wan22Image2VideoDenoiseStep(WanDenoiseLoopWrapper):
+    block_classes = [
+        WanImage2VideoLoopBeforeDenoiser,
+        Wan22LoopDenoiser(
+            guider_input_fields={
+                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+            }
+        ),
+        WanLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. \n"
+            "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `WanImage2VideoLoopBeforeDenoiser`\n"
+            " - `WanLoopDenoiser`\n"
+            " - `WanLoopAfterDenoiser`\n"
+            "This block supports image-to-video tasks for Wan2.2."
+        )
+
+
+class WanFLF2VDenoiseStep(WanDenoiseLoopWrapper):
+    block_classes = [
+        WanFLF2VLoopBeforeDenoiser,
+        WanLoopDenoiser(
+            guider_input_fields={
+                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+                "encoder_hidden_states_image": "image_embeds",
+            }
+        ),
         WanLoopAfterDenoiser,
     ]
     block_names = ["before_denoiser", "denoiser", "after_denoiser"]
@@ -259,7 +605,8 @@ def description(self) -> str:
             "Denoise step that iteratively denoise the latents. \n"
             "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
             "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `WanFLF2VLoopBeforeDenoiser`\n"
             " - `WanLoopDenoiser`\n"
             " - `WanLoopAfterDenoiser`\n"
-            "This block supports both text2vid tasks."
+            "This block supports FLF2V tasks for wan2.1."
         )
diff --git a/src/diffusers/modular_pipelines/wan/encoders.py b/src/diffusers/modular_pipelines/wan/encoders.py
index cb2fc242383c..dc49df8eab8c 100644
--- a/src/diffusers/modular_pipelines/wan/encoders.py
+++ b/src/diffusers/modular_pipelines/wan/encoders.py
@@ -15,21 +15,29 @@
 import html
 from typing import List, Optional, Union
 
+import numpy as np
+import PIL
 import regex as re
 import torch
-from transformers import AutoTokenizer, UMT5EncoderModel
+from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel
 
 from ...configuration_utils import FrozenDict
 from ...guiders import ClassifierFreeGuidance
-from ...utils import is_ftfy_available, logging
+from ...image_processor import PipelineImageInput
+from ...models import AutoencoderKLWan
+from ...utils import is_ftfy_available, is_torchvision_available, logging
+from ...video_processor import VideoProcessor
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
-from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import WanModularPipeline
 
 
 if is_ftfy_available():
     import ftfy
 
+if is_torchvision_available():
+    from torchvision import transforms
+
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -51,6 +59,103 @@ def prompt_clean(text):
     return text
 
 
+def get_t5_prompt_embeds(
+    text_encoder: UMT5EncoderModel,
+    tokenizer: AutoTokenizer,
+    prompt: Union[str, List[str]],
+    max_sequence_length: int,
+    device: torch.device,
+):
+    dtype = text_encoder.dtype
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+    prompt = [prompt_clean(u) for u in prompt]
+
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=max_sequence_length,
+        truncation=True,
+        add_special_tokens=True,
+        return_attention_mask=True,
+        return_tensors="pt",
+    )
+    text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
+    seq_lens = mask.gt(0).sum(dim=1).long()
+    prompt_embeds = text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+    prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
+    prompt_embeds = torch.stack(
+        [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
+    )
+
+    return prompt_embeds
+
+
+def encode_image(
+    image: PipelineImageInput,
+    image_processor: CLIPImageProcessor,
+    image_encoder: CLIPVisionModel,
+    device: Optional[torch.device] = None,
+):
+    image = image_processor(images=image, return_tensors="pt").to(device)
+    image_embeds = image_encoder(**image, output_hidden_states=True)
+    return image_embeds.hidden_states[-2]
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+def encode_vae_image(
+    video_tensor: torch.Tensor,
+    vae: AutoencoderKLWan,
+    generator: torch.Generator,
+    device: torch.device,
+    dtype: torch.dtype,
+    latent_channels: int = 16,
+):
+    if not isinstance(video_tensor, torch.Tensor):
+        raise ValueError(f"Expected video_tensor to be a tensor, got {type(video_tensor)}.")
+
+    if isinstance(generator, list) and len(generator) != video_tensor.shape[0]:
+        raise ValueError(
+            f"You have passed a list of generators of length {len(generator)}, but it is not same as number of images {video_tensor.shape[0]}."
+        )
+
+    video_tensor = video_tensor.to(device=device, dtype=dtype)
+
+    if isinstance(generator, list):
+        video_latents = [
+            retrieve_latents(vae.encode(video_tensor[i : i + 1]), generator=generator[i], sample_mode="argmax")
+            for i in range(video_tensor.shape[0])
+        ]
+        video_latents = torch.cat(video_latents, dim=0)
+    else:
+        video_latents = retrieve_latents(vae.encode(video_tensor), sample_mode="argmax")
+
+    latents_mean = (
+        torch.tensor(vae.config.latents_mean)
+        .view(1, latent_channels, 1, 1, 1)
+        .to(video_latents.device, video_latents.dtype)
+    )
+    latents_std = 1.0 / torch.tensor(vae.config.latents_std).view(1, latent_channels, 1, 1, 1).to(
+        video_latents.device, video_latents.dtype
+    )
+    video_latents = (video_latents - latents_mean) * latents_std
+
+    return video_latents
+
+
 class WanTextEncoderStep(ModularPipelineBlocks):
     model_name = "wan"
 
@@ -71,16 +176,12 @@ def expected_components(self) -> List[ComponentSpec]:
             ),
         ]
 
-    @property
-    def expected_configs(self) -> List[ConfigSpec]:
-        return []
-
     @property
     def inputs(self) -> List[InputParam]:
         return [
             InputParam("prompt"),
             InputParam("negative_prompt"),
-            InputParam("attention_kwargs"),
+            InputParam("max_sequence_length", default=512),
         ]
 
     @property
@@ -107,47 +208,13 @@ def check_inputs(block_state):
         ):
             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")
 
-    @staticmethod
-    def _get_t5_prompt_embeds(
-        components,
-        prompt: Union[str, List[str]],
-        max_sequence_length: int,
-        device: torch.device,
-    ):
-        dtype = components.text_encoder.dtype
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        prompt = [prompt_clean(u) for u in prompt]
-
-        text_inputs = components.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_sequence_length,
-            truncation=True,
-            add_special_tokens=True,
-            return_attention_mask=True,
-            return_tensors="pt",
-        )
-        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
-        seq_lens = mask.gt(0).sum(dim=1).long()
-        prompt_embeds = components.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
-        prompt_embeds = torch.stack(
-            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
-        )
-
-        return prompt_embeds
-
     @staticmethod
     def encode_prompt(
         components,
         prompt: str,
         device: Optional[torch.device] = None,
-        num_videos_per_prompt: int = 1,
         prepare_unconditional_embeds: bool = True,
         negative_prompt: Optional[str] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
         max_sequence_length: int = 512,
     ):
         r"""
@@ -158,32 +225,29 @@ def encode_prompt(
                 prompt to be encoded
             device: (`torch.device`):
                 torch device
-            num_videos_per_prompt (`int`):
-                number of videos that should be generated per prompt
             prepare_unconditional_embeds (`bool`):
                 whether to use prepare unconditional embeddings or not
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
             max_sequence_length (`int`, defaults to `512`):
                 The maximum number of text tokens to be used for the generation process.
         """
         device = device or components._execution_device
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        batch_size = len(prompt) if prompt is not None else prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            prompt_embeds = WanTextEncoderStep._get_t5_prompt_embeds(components, prompt, max_sequence_length, device)
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        batch_size = len(prompt)
+
+        prompt_embeds = get_t5_prompt_embeds(
+            text_encoder=components.text_encoder,
+            tokenizer=components.tokenizer,
+            prompt=prompt,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
 
-        if prepare_unconditional_embeds and negative_prompt_embeds is None:
+        if prepare_unconditional_embeds:
             negative_prompt = negative_prompt or ""
             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
 
@@ -199,18 +263,14 @@ def encode_prompt(
                     " the batch size of `prompt`."
                 )
 
-            negative_prompt_embeds = WanTextEncoderStep._get_t5_prompt_embeds(
-                components, negative_prompt, max_sequence_length, device
+            negative_prompt_embeds = get_t5_prompt_embeds(
+                text_encoder=components.text_encoder,
+                tokenizer=components.tokenizer,
+                prompt=negative_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
             )
 
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
-
-        if prepare_unconditional_embeds:
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
         return prompt_embeds, negative_prompt_embeds
 
     @torch.no_grad()
@@ -219,7 +279,6 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
         block_state = self.get_block_state(state)
         self.check_inputs(block_state)
 
-        block_state.prepare_unconditional_embeds = components.guider.num_conditions > 1
         block_state.device = components._execution_device
 
         # Encode input prompt
@@ -227,16 +286,382 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
             block_state.prompt_embeds,
             block_state.negative_prompt_embeds,
         ) = self.encode_prompt(
-            components,
-            block_state.prompt,
-            block_state.device,
-            1,
-            block_state.prepare_unconditional_embeds,
-            block_state.negative_prompt,
-            prompt_embeds=None,
-            negative_prompt_embeds=None,
+            components=components,
+            prompt=block_state.prompt,
+            device=block_state.device,
+            prepare_unconditional_embeds=components.requires_unconditional_embeds,
+            negative_prompt=block_state.negative_prompt,
+            max_sequence_length=block_state.max_sequence_length,
         )
 
         # Add outputs
         self.set_block_state(state, block_state)
         return components, state
+
+
+class WanImageResizeStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "Image Resize step that resize the image to the target area (height * width) while maintaining the aspect ratio."
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("image", type_hint=PIL.Image.Image, required=True),
+            InputParam("height", type_hint=int, default=480),
+            InputParam("width", type_hint=int, default=832),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam("resized_image", type_hint=PIL.Image.Image),
+        ]
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        max_area = block_state.height * block_state.width
+
+        image = block_state.image
+        aspect_ratio = image.height / image.width
+        mod_value = components.vae_scale_factor_spatial * components.patch_size_spatial
+        block_state.height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+        block_state.width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+        block_state.resized_image = image.resize((block_state.width, block_state.height))
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class WanImageCropResizeStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "Image Resize step that resize the last_image to the same size of first frame image with center crop."
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                "resized_image", type_hint=PIL.Image.Image, required=True, description="The resized first frame image"
+            ),
+            InputParam("last_image", type_hint=PIL.Image.Image, required=True, description="The last frameimage"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam("resized_last_image", type_hint=PIL.Image.Image),
+        ]
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        height = block_state.resized_image.height
+        width = block_state.resized_image.width
+        image = block_state.last_image
+
+        # Calculate resize ratio to match first frame dimensions
+        resize_ratio = max(width / image.width, height / image.height)
+
+        # Resize the image
+        width = round(image.width * resize_ratio)
+        height = round(image.height * resize_ratio)
+        size = [width, height]
+        resized_image = transforms.functional.center_crop(image, size)
+        block_state.resized_last_image = resized_image
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class WanImageEncoderStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "Image Encoder step that generate image_embeds based on first frame image to guide the video generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("image_processor", CLIPImageProcessor),
+            ComponentSpec("image_encoder", CLIPVisionModel),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("resized_image", type_hint=PIL.Image.Image, required=True),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam("image_embeds", type_hint=torch.Tensor, description="The image embeddings"),
+        ]
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        device = components._execution_device
+
+        image = block_state.resized_image
+
+        image_embeds = encode_image(
+            image_processor=components.image_processor,
+            image_encoder=components.image_encoder,
+            image=image,
+            device=device,
+        )
+        block_state.image_embeds = image_embeds
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class WanFirstLastFrameImageEncoderStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "Image Encoder step that generate image_embeds based on first and last frame images to guide the video generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("image_processor", CLIPImageProcessor),
+            ComponentSpec("image_encoder", CLIPVisionModel),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("resized_image", type_hint=PIL.Image.Image, required=True),
+            InputParam("resized_last_image", type_hint=PIL.Image.Image, required=True),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam("image_embeds", type_hint=torch.Tensor, description="The image embeddings"),
+        ]
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        device = components._execution_device
+
+        first_frame_image = block_state.resized_image
+        last_frame_image = block_state.resized_last_image
+
+        image_embeds = encode_image(
+            image_processor=components.image_processor,
+            image_encoder=components.image_encoder,
+            image=[first_frame_image, last_frame_image],
+            device=device,
+        )
+        block_state.image_embeds = image_embeds
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class WanVaeImageEncoderStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "Vae Image Encoder step that generate condition_latents based on first frame image to guide the video generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKLWan),
+            ComponentSpec(
+                "video_processor",
+                VideoProcessor,
+                config=FrozenDict({"vae_scale_factor": 8}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("resized_image", type_hint=PIL.Image.Image, required=True),
+            InputParam("height"),
+            InputParam("width"),
+            InputParam("num_frames"),
+            InputParam("generator"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "first_frame_latents",
+                type_hint=torch.Tensor,
+                description="video latent representation with the first frame image condition",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(components, block_state):
+        if (block_state.height is not None and block_state.height % components.vae_scale_factor_spatial != 0) or (
+            block_state.width is not None and block_state.width % components.vae_scale_factor_spatial != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {components.vae_scale_factor_spatial} but are {block_state.height} and {block_state.width}."
+            )
+        if block_state.num_frames is not None and (
+            block_state.num_frames < 1 or (block_state.num_frames - 1) % components.vae_scale_factor_temporal != 0
+        ):
+            raise ValueError(
+                f"`num_frames` has to be greater than 0, and (num_frames - 1) must be divisible by {components.vae_scale_factor_temporal}, but got {block_state.num_frames}."
+            )
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        image = block_state.resized_image
+
+        device = components._execution_device
+        dtype = torch.float32
+
+        height = block_state.height or components.default_height
+        width = block_state.width or components.default_width
+        num_frames = block_state.num_frames or components.default_num_frames
+
+        image_tensor = components.video_processor.preprocess(image, height=height, width=width).to(
+            device=device, dtype=dtype
+        )
+
+        if image_tensor.dim() == 4:
+            image_tensor = image_tensor.unsqueeze(2)
+
+        video_tensor = torch.cat(
+            [
+                image_tensor,
+                image_tensor.new_zeros(image_tensor.shape[0], image_tensor.shape[1], num_frames - 1, height, width),
+            ],
+            dim=2,
+        ).to(device=device, dtype=dtype)
+
+        block_state.first_frame_latents = encode_vae_image(
+            video_tensor=video_tensor,
+            vae=components.vae,
+            generator=block_state.generator,
+            device=device,
+            dtype=dtype,
+            latent_channels=components.num_channels_latents,
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class WanFirstLastFrameVaeImageEncoderStep(ModularPipelineBlocks):
+    model_name = "wan"
+
+    @property
+    def description(self) -> str:
+        return "Vae Image Encoder step that generate condition_latents based on first and last frame images to guide the video generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKLWan),
+            ComponentSpec(
+                "video_processor",
+                VideoProcessor,
+                config=FrozenDict({"vae_scale_factor": 8}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("resized_image", type_hint=PIL.Image.Image, required=True),
+            InputParam("resized_last_image", type_hint=PIL.Image.Image, required=True),
+            InputParam("height"),
+            InputParam("width"),
+            InputParam("num_frames"),
+            InputParam("generator"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "first_last_frame_latents",
+                type_hint=torch.Tensor,
+                description="video latent representation with the first and last frame images condition",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(components, block_state):
+        if (block_state.height is not None and block_state.height % components.vae_scale_factor_spatial != 0) or (
+            block_state.width is not None and block_state.width % components.vae_scale_factor_spatial != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {components.vae_scale_factor_spatial} but are {block_state.height} and {block_state.width}."
+            )
+        if block_state.num_frames is not None and (
+            block_state.num_frames < 1 or (block_state.num_frames - 1) % components.vae_scale_factor_temporal != 0
+        ):
+            raise ValueError(
+                f"`num_frames` has to be greater than 0, and (num_frames - 1) must be divisible by {components.vae_scale_factor_temporal}, but got {block_state.num_frames}."
+            )
+
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        first_frame_image = block_state.resized_image
+        last_frame_image = block_state.resized_last_image
+
+        device = components._execution_device
+        dtype = torch.float32
+
+        height = block_state.height or components.default_height
+        width = block_state.width or components.default_width
+        num_frames = block_state.num_frames or components.default_num_frames
+
+        first_image_tensor = components.video_processor.preprocess(first_frame_image, height=height, width=width).to(
+            device=device, dtype=dtype
+        )
+        first_image_tensor = first_image_tensor.unsqueeze(2)
+
+        last_image_tensor = components.video_processor.preprocess(last_frame_image, height=height, width=width).to(
+            device=device, dtype=dtype
+        )
+
+        last_image_tensor = last_image_tensor.unsqueeze(2)
+
+        video_tensor = torch.cat(
+            [
+                first_image_tensor,
+                first_image_tensor.new_zeros(
+                    first_image_tensor.shape[0], first_image_tensor.shape[1], num_frames - 2, height, width
+                ),
+                last_image_tensor,
+            ],
+            dim=2,
+        ).to(device=device, dtype=dtype)
+
+        block_state.first_last_frame_latents = encode_vae_image(
+            video_tensor=video_tensor,
+            vae=components.vae,
+            generator=block_state.generator,
+            device=device,
+            dtype=dtype,
+            latent_channels=components.num_channels_latents,
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks.py b/src/diffusers/modular_pipelines/wan/modular_blocks.py
index 5f4c1a983566..b3b70b2f9be1 100644
--- a/src/diffusers/modular_pipelines/wan/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/wan/modular_blocks.py
@@ -16,96 +16,244 @@
 from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict
 from .before_denoise import (
-    WanInputStep,
+    WanAdditionalInputsStep,
+    WanPrepareFirstFrameLatentsStep,
+    WanPrepareFirstLastFrameLatentsStep,
     WanPrepareLatentsStep,
     WanSetTimestepsStep,
+    WanTextInputStep,
+)
+from .decoders import WanImageVaeDecoderStep
+from .denoise import (
+    Wan22DenoiseStep,
+    Wan22Image2VideoDenoiseStep,
+    WanDenoiseStep,
+    WanFLF2VDenoiseStep,
+    WanImage2VideoDenoiseStep,
+)
+from .encoders import (
+    WanFirstLastFrameImageEncoderStep,
+    WanFirstLastFrameVaeImageEncoderStep,
+    WanImageCropResizeStep,
+    WanImageEncoderStep,
+    WanImageResizeStep,
+    WanTextEncoderStep,
+    WanVaeImageEncoderStep,
 )
-from .decoders import WanDecodeStep
-from .denoise import WanDenoiseStep
-from .encoders import WanTextEncoderStep
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-# before_denoise: text2vid
-class WanBeforeDenoiseStep(SequentialPipelineBlocks):
+# wan2.1
+# wan2.1: text2vid
+class WanCoreDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
-        WanInputStep,
+        WanTextInputStep,
         WanSetTimestepsStep,
         WanPrepareLatentsStep,
+        WanDenoiseStep,
     ]
-    block_names = ["input", "set_timesteps", "prepare_latents"]
+    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]
 
     @property
     def description(self):
         return (
-            "Before denoise step that prepare the inputs for the denoise step.\n"
+            "denoise block that takes encoded conditions and runs the denoising process.\n"
             + "This is a sequential pipeline blocks:\n"
-            + " - `WanInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
             + " - `WanSetTimestepsStep` is used to set the timesteps\n"
             + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
+            + " - `WanDenoiseStep` is used to denoise the latents\n"
         )
 
 
-# before_denoise: all task (text2vid,)
-class WanAutoBeforeDenoiseStep(AutoPipelineBlocks):
+# wan2.1: image2video
+## image encoder
+class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks):
+    model_name = "wan"
+    block_classes = [WanImageResizeStep, WanImageEncoderStep]
+    block_names = ["image_resize", "image_encoder"]
+
+    @property
+    def description(self):
+        return "Image2Video Image Encoder step that resize the image and encode the image to generate the image embeddings"
+
+
+## vae encoder
+class WanImage2VideoVaeImageEncoderStep(SequentialPipelineBlocks):
+    model_name = "wan"
+    block_classes = [WanImageResizeStep, WanVaeImageEncoderStep]
+    block_names = ["image_resize", "vae_image_encoder"]
+
+    @property
+    def description(self):
+        return "Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent representation"
+
+
+## denoise
+class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
-        WanBeforeDenoiseStep,
+        WanTextInputStep,
+        WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"]),
+        WanSetTimestepsStep,
+        WanPrepareLatentsStep,
+        WanPrepareFirstFrameLatentsStep,
+        WanImage2VideoDenoiseStep,
+    ]
+    block_names = [
+        "input",
+        "additional_inputs",
+        "set_timesteps",
+        "prepare_latents",
+        "prepare_first_frame_latents",
+        "denoise",
     ]
-    block_names = ["text2vid"]
-    block_trigger_inputs = [None]
 
     @property
     def description(self):
         return (
-            "Before denoise step that prepare the inputs for the denoise step.\n"
-            + "This is an auto pipeline block that works for text2vid.\n"
-            + " - `WanBeforeDenoiseStep` (text2vid) is used.\n"
+            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
+            + "This is a sequential pipeline blocks:\n"
+            + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
+            + " - `WanSetTimestepsStep` is used to set the timesteps\n"
+            + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
+            + " - `WanPrepareFirstFrameLatentsStep` is used to prepare the first frame latent conditions\n"
+            + " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n"
         )
 
 
-# denoise: text2vid
-class WanAutoDenoiseStep(AutoPipelineBlocks):
+# wan2.1: FLF2v
+
+
+## image encoder
+class WanFLF2VImageEncoderStep(SequentialPipelineBlocks):
+    model_name = "wan"
+    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep]
+    block_names = ["image_resize", "last_image_resize", "image_encoder"]
+
+    @property
+    def description(self):
+        return "FLF2V Image Encoder step that resize and encode and encode the first and last frame images to generate the image embeddings"
+
+
+## vae encoder
+class WanFLF2VVaeImageEncoderStep(SequentialPipelineBlocks):
+    model_name = "wan"
+    block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameVaeImageEncoderStep]
+    block_names = ["image_resize", "last_image_resize", "vae_image_encoder"]
+
+    @property
+    def description(self):
+        return "FLF2V Vae Image Encoder step that resize and encode and encode the first and last frame images to generate the latent conditions"
+
+
+## denoise
+class WanFLF2VCoreDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
-        WanDenoiseStep,
+        WanTextInputStep,
+        WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"]),
+        WanSetTimestepsStep,
+        WanPrepareLatentsStep,
+        WanPrepareFirstLastFrameLatentsStep,
+        WanFLF2VDenoiseStep,
+    ]
+    block_names = [
+        "input",
+        "additional_inputs",
+        "set_timesteps",
+        "prepare_latents",
+        "prepare_first_last_frame_latents",
+        "denoise",
     ]
-    block_names = ["denoise"]
-    block_trigger_inputs = [None]
 
     @property
-    def description(self) -> str:
+    def description(self):
         return (
-            "Denoise step that iteratively denoise the latents. "
-            "This is a auto pipeline block that works for text2vid tasks.."
-            " - `WanDenoiseStep` (denoise) for text2vid tasks."
+            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
+            + "This is a sequential pipeline blocks:\n"
+            + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
+            + " - `WanSetTimestepsStep` is used to set the timesteps\n"
+            + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
+            + " - `WanPrepareFirstLastFrameLatentsStep` is used to prepare the latent conditions\n"
+            + " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n"
+        )
+
+
+# wan2.1: auto blocks
+## image encoder
+class WanAutoImageEncoderStep(AutoPipelineBlocks):
+    block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep]
+    block_names = ["flf2v_image_encoder", "image2video_image_encoder"]
+    block_trigger_inputs = ["last_image", "image"]
+
+    @property
+    def description(self):
+        return (
+            "Image Encoder step that encode the image to generate the image embeddings"
+            + "This is an auto pipeline block that works for image2video tasks."
+            + " - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided."
+            + " - `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided."
+            + " - if `last_image` or `image` is not provided, step will be skipped."
         )
 
 
-# decode: all task (text2img, img2img, inpainting)
-class WanAutoDecodeStep(AutoPipelineBlocks):
-    block_classes = [WanDecodeStep]
-    block_names = ["non-inpaint"]
-    block_trigger_inputs = [None]
+## vae encoder
+class WanAutoVaeImageEncoderStep(AutoPipelineBlocks):
+    block_classes = [WanFLF2VVaeImageEncoderStep, WanImage2VideoVaeImageEncoderStep]
+    block_names = ["flf2v_vae_image_encoder", "image2video_vae_image_encoder"]
+    block_trigger_inputs = ["last_image", "image"]
 
     @property
     def description(self):
-        return "Decode step that decode the denoised latents into videos outputs.\n - `WanDecodeStep`"
+        return (
+            "Vae Image Encoder step that encode the image to generate the image latents"
+            + "This is an auto pipeline block that works for image2video tasks."
+            + " - `WanFLF2VVaeImageEncoderStep` (flf2v) is used when `last_image` is provided."
+            + " - `WanImage2VideoVaeImageEncoderStep` (image2video) is used when `image` is provided."
+            + " - if `last_image` or `image` is not provided, step will be skipped."
+        )
+
 
+## denoise
+class WanAutoDenoiseStep(AutoPipelineBlocks):
+    block_classes = [
+        WanFLF2VCoreDenoiseStep,
+        WanImage2VideoCoreDenoiseStep,
+        WanCoreDenoiseStep,
+    ]
+    block_names = ["flf2v", "image2video", "text2video"]
+    block_trigger_inputs = ["first_last_frame_latents", "first_frame_latents", None]
 
-# text2vid
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. "
+            "This is a auto pipeline block that works for text2video and image2video tasks."
+            " - `WanCoreDenoiseStep` (text2video) for text2vid tasks."
+            " - `WanCoreImage2VideoCoreDenoiseStep` (image2video) for image2video tasks."
+            + " - if `first_frame_latents` is provided, `WanCoreImage2VideoDenoiseStep` will be used.\n"
+            + " - if `first_frame_latents` is not provided, `WanCoreDenoiseStep` will be used.\n"
+        )
+
+
+# auto pipeline blocks
 class WanAutoBlocks(SequentialPipelineBlocks):
     block_classes = [
         WanTextEncoderStep,
-        WanAutoBeforeDenoiseStep,
+        WanAutoImageEncoderStep,
+        WanAutoVaeImageEncoderStep,
         WanAutoDenoiseStep,
-        WanAutoDecodeStep,
+        WanImageVaeDecoderStep,
     ]
     block_names = [
         "text_encoder",
-        "before_denoise",
+        "image_encoder",
+        "vae_image_encoder",
         "denoise",
-        "decoder",
+        "decode",
     ]
 
     @property
@@ -116,29 +264,211 @@ def description(self):
         )
 
 
+# wan22
+# wan2.2: text2vid
+
+
+## denoise
+class Wan22CoreDenoiseStep(SequentialPipelineBlocks):
+    block_classes = [
+        WanTextInputStep,
+        WanSetTimestepsStep,
+        WanPrepareLatentsStep,
+        Wan22DenoiseStep,
+    ]
+    block_names = ["input", "set_timesteps", "prepare_latents", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "denoise block that takes encoded conditions and runs the denoising process.\n"
+            + "This is a sequential pipeline blocks:\n"
+            + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `WanSetTimestepsStep` is used to set the timesteps\n"
+            + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
+            + " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n"
+        )
+
+
+# wan2.2: image2video
+## denoise
+class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
+    block_classes = [
+        WanTextInputStep,
+        WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"]),
+        WanSetTimestepsStep,
+        WanPrepareLatentsStep,
+        WanPrepareFirstFrameLatentsStep,
+        Wan22Image2VideoDenoiseStep,
+    ]
+    block_names = [
+        "input",
+        "additional_inputs",
+        "set_timesteps",
+        "prepare_latents",
+        "prepare_first_frame_latents",
+        "denoise",
+    ]
+
+    @property
+    def description(self):
+        return (
+            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
+            + "This is a sequential pipeline blocks:\n"
+            + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
+            + " - `WanSetTimestepsStep` is used to set the timesteps\n"
+            + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
+            + " - `WanPrepareFirstFrameLatentsStep` is used to prepare the first frame latent conditions\n"
+            + " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n"
+        )
+
+
+class Wan22AutoDenoiseStep(AutoPipelineBlocks):
+    block_classes = [
+        Wan22Image2VideoCoreDenoiseStep,
+        Wan22CoreDenoiseStep,
+    ]
+    block_names = ["image2video", "text2video"]
+    block_trigger_inputs = ["first_frame_latents", None]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. "
+            "This is a auto pipeline block that works for text2video and image2video tasks."
+            " - `Wan22Image2VideoCoreDenoiseStep` (image2video) for image2video tasks."
+            " - `Wan22CoreDenoiseStep` (text2video) for text2vid tasks."
+            + " - if `first_frame_latents` is provided, `Wan22Image2VideoCoreDenoiseStep` will be used.\n"
+            + " - if `first_frame_latents` is not provided, `Wan22CoreDenoiseStep` will be used.\n"
+        )
+
+
+class Wan22AutoBlocks(SequentialPipelineBlocks):
+    block_classes = [
+        WanTextEncoderStep,
+        WanAutoVaeImageEncoderStep,
+        Wan22AutoDenoiseStep,
+        WanImageVaeDecoderStep,
+    ]
+    block_names = [
+        "text_encoder",
+        "vae_image_encoder",
+        "denoise",
+        "decode",
+    ]
+
+    @property
+    def description(self):
+        return (
+            "Auto Modular pipeline for text-to-video using Wan2.2.\n"
+            + "- for text-to-video generation, all you need to provide is `prompt`"
+        )
+
+
+# presets for wan2.1 and wan2.2
+# YiYi Notes: should we move these to doc?
+# wan2.1
 TEXT2VIDEO_BLOCKS = InsertableDict(
     [
         ("text_encoder", WanTextEncoderStep),
-        ("input", WanInputStep),
+        ("input", WanTextInputStep),
         ("set_timesteps", WanSetTimestepsStep),
         ("prepare_latents", WanPrepareLatentsStep),
         ("denoise", WanDenoiseStep),
-        ("decode", WanDecodeStep),
+        ("decode", WanImageVaeDecoderStep),
+    ]
+)
+
+IMAGE2VIDEO_BLOCKS = InsertableDict(
+    [
+        ("image_resize", WanImageResizeStep),
+        ("image_encoder", WanImage2VideoImageEncoderStep),
+        ("vae_image_encoder", WanImage2VideoVaeImageEncoderStep),
+        ("input", WanTextInputStep),
+        ("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"])),
+        ("set_timesteps", WanSetTimestepsStep),
+        ("prepare_latents", WanPrepareLatentsStep),
+        ("prepare_first_frame_latents", WanPrepareFirstFrameLatentsStep),
+        ("denoise", WanImage2VideoDenoiseStep),
+        ("decode", WanImageVaeDecoderStep),
     ]
 )
 
 
+FLF2V_BLOCKS = InsertableDict(
+    [
+        ("image_resize", WanImageResizeStep),
+        ("last_image_resize", WanImageCropResizeStep),
+        ("image_encoder", WanFLF2VImageEncoderStep),
+        ("vae_image_encoder", WanFLF2VVaeImageEncoderStep),
+        ("input", WanTextInputStep),
+        ("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"])),
+        ("set_timesteps", WanSetTimestepsStep),
+        ("prepare_latents", WanPrepareLatentsStep),
+        ("prepare_first_last_frame_latents", WanPrepareFirstLastFrameLatentsStep),
+        ("denoise", WanFLF2VDenoiseStep),
+        ("decode", WanImageVaeDecoderStep),
+    ]
+)
+
 AUTO_BLOCKS = InsertableDict(
     [
         ("text_encoder", WanTextEncoderStep),
-        ("before_denoise", WanAutoBeforeDenoiseStep),
+        ("image_encoder", WanAutoImageEncoderStep),
+        ("vae_image_encoder", WanAutoVaeImageEncoderStep),
         ("denoise", WanAutoDenoiseStep),
-        ("decode", WanAutoDecodeStep),
+        ("decode", WanImageVaeDecoderStep),
     ]
 )
 
+# wan2.2 presets
+
+TEXT2VIDEO_BLOCKS_WAN22 = InsertableDict(
+    [
+        ("text_encoder", WanTextEncoderStep),
+        ("input", WanTextInputStep),
+        ("set_timesteps", WanSetTimestepsStep),
+        ("prepare_latents", WanPrepareLatentsStep),
+        ("denoise", Wan22DenoiseStep),
+        ("decode", WanImageVaeDecoderStep),
+    ]
+)
+
+IMAGE2VIDEO_BLOCKS_WAN22 = InsertableDict(
+    [
+        ("image_resize", WanImageResizeStep),
+        ("vae_image_encoder", WanImage2VideoVaeImageEncoderStep),
+        ("input", WanTextInputStep),
+        ("set_timesteps", WanSetTimestepsStep),
+        ("prepare_latents", WanPrepareLatentsStep),
+        ("denoise", Wan22DenoiseStep),
+        ("decode", WanImageVaeDecoderStep),
+    ]
+)
+
+AUTO_BLOCKS_WAN22 = InsertableDict(
+    [
+        ("text_encoder", WanTextEncoderStep),
+        ("vae_image_encoder", WanAutoVaeImageEncoderStep),
+        ("denoise", Wan22AutoDenoiseStep),
+        ("decode", WanImageVaeDecoderStep),
+    ]
+)
+
+# presets all blocks (wan and wan22)
+
 
 ALL_BLOCKS = {
-    "text2video": TEXT2VIDEO_BLOCKS,
-    "auto": AUTO_BLOCKS,
+    "wan2.1": {
+        "text2video": TEXT2VIDEO_BLOCKS,
+        "image2video": IMAGE2VIDEO_BLOCKS,
+        "flf2v": FLF2V_BLOCKS,
+        "auto": AUTO_BLOCKS,
+    },
+    "wan2.2": {
+        "text2video": TEXT2VIDEO_BLOCKS_WAN22,
+        "image2video": IMAGE2VIDEO_BLOCKS_WAN22,
+        "auto": AUTO_BLOCKS_WAN22,
+    },
 }
diff --git a/src/diffusers/modular_pipelines/wan/modular_pipeline.py b/src/diffusers/modular_pipelines/wan/modular_pipeline.py
index e4adf3d151d6..930b25e4b905 100644
--- a/src/diffusers/modular_pipelines/wan/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/wan/modular_pipeline.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 
+from typing import Any, Dict, Optional
+
 from ...loaders import WanLoraLoaderMixin
 from ...pipelines.pipeline_utils import StableDiffusionMixin
 from ...utils import logging
@@ -35,6 +37,13 @@ class WanModularPipeline(
 
     default_blocks_name = "WanAutoBlocks"
 
+    # override the default_blocks_name in base class, which is just return self.default_blocks_name
+    def get_default_blocks_name(self, config_dict: Optional[Dict[str, Any]]) -> Optional[str]:
+        if config_dict is not None and "boundary_ratio" in config_dict and config_dict["boundary_ratio"] is not None:
+            return "Wan22AutoBlocks"
+        else:
+            return "WanAutoBlocks"
+
     @property
     def default_height(self):
         return self.default_sample_height * self.vae_scale_factor_spatial
@@ -59,6 +68,13 @@ def default_sample_width(self):
     def default_sample_num_frames(self):
         return 21
 
+    @property
+    def patch_size_spatial(self):
+        patch_size_spatial = 2
+        if hasattr(self, "transformer") and self.transformer is not None:
+            patch_size_spatial = self.transformer.config.patch_size[1]
+        return patch_size_spatial
+
     @property
     def vae_scale_factor_spatial(self):
         vae_scale_factor = 8
@@ -86,3 +102,19 @@ def num_channels_latents(self):
         if hasattr(self, "vae") and self.vae is not None:
             num_channels_latents = self.vae.config.z_dim
         return num_channels_latents
+
+    @property
+    def requires_unconditional_embeds(self):
+        requires_unconditional_embeds = False
+
+        if hasattr(self, "guider") and self.guider is not None:
+            requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
+
+        return requires_unconditional_embeds
+
+    @property
+    def num_train_timesteps(self):
+        num_train_timesteps = 1000
+        if hasattr(self, "scheduler") and self.scheduler is not None:
+            num_train_timesteps = self.scheduler.config.num_train_timesteps
+        return num_train_timesteps
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index db357669b6f3..388551f812f8 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -129,6 +129,7 @@
     ]
     _import_structure["bria"] = ["BriaPipeline"]
     _import_structure["bria_fibo"] = ["BriaFiboPipeline"]
+    _import_structure["flux2"] = ["Flux2Pipeline"]
     _import_structure["flux"] = [
         "FluxControlPipeline",
         "FluxControlInpaintPipeline",
@@ -242,6 +243,7 @@
         "HunyuanVideoImageToVideoPipeline",
         "HunyuanVideoFramepackPipeline",
     ]
+    _import_structure["hunyuan_video1_5"] = ["HunyuanVideo15Pipeline", "HunyuanVideo15ImageToVideoPipeline"]
     _import_structure["hunyuan_image"] = ["HunyuanImagePipeline", "HunyuanImageRefinerPipeline"]
     _import_structure["kandinsky"] = [
         "KandinskyCombinedPipeline",
@@ -299,6 +301,7 @@
     _import_structure["mochi"] = ["MochiPipeline"]
     _import_structure["musicldm"] = ["MusicLDMPipeline"]
     _import_structure["omnigen"] = ["OmniGenPipeline"]
+    _import_structure["ovis_image"] = ["OvisImagePipeline"]
     _import_structure["visualcloze"] = ["VisualClozePipeline", "VisualClozeGenerationPipeline"]
     _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
     _import_structure["pia"] = ["PIAPipeline"]
@@ -309,6 +312,10 @@
         "SanaControlNetPipeline",
         "SanaSprintImg2ImgPipeline",
     ]
+    _import_structure["sana_video"] = [
+        "SanaVideoPipeline",
+        "SanaImageToVideoPipeline",
+    ]
     _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
     _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
     _import_structure["stable_audio"] = [
@@ -384,8 +391,20 @@
         "WuerstchenDecoderPipeline",
         "WuerstchenPriorPipeline",
     ]
-    _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline", "WanVACEPipeline"]
-    _import_structure["kandinsky5"] = ["Kandinsky5T2VPipeline"]
+    _import_structure["wan"] = [
+        "WanPipeline",
+        "WanImageToVideoPipeline",
+        "WanVideoToVideoPipeline",
+        "WanVACEPipeline",
+        "WanAnimatePipeline",
+    ]
+    _import_structure["kandinsky5"] = [
+        "Kandinsky5T2VPipeline",
+        "Kandinsky5I2VPipeline",
+        "Kandinsky5T2IPipeline",
+        "Kandinsky5I2IPipeline",
+    ]
+    _import_structure["z_image"] = ["ZImageImg2ImgPipeline", "ZImagePipeline"]
     _import_structure["skyreels_v2"] = [
         "SkyReelsV2DiffusionForcingPipeline",
         "SkyReelsV2DiffusionForcingImageToVideoPipeline",
@@ -403,6 +422,7 @@
         "QwenImageControlNetInpaintPipeline",
         "QwenImageControlNetPipeline",
     ]
+    _import_structure["chronoedit"] = ["ChronoEditPipeline"]
 try:
     if not is_onnx_available():
         raise OptionalDependencyNotAvailable()
@@ -565,6 +585,7 @@
         from .bria import BriaPipeline
         from .bria_fibo import BriaFiboPipeline
         from .chroma import ChromaImg2ImgPipeline, ChromaPipeline
+        from .chronoedit import ChronoEditPipeline
         from .cogvideo import (
             CogVideoXFunControlPipeline,
             CogVideoXImageToVideoPipeline,
@@ -642,6 +663,7 @@
             FluxPriorReduxPipeline,
             ReduxImageEncoder,
         )
+        from .flux2 import Flux2Pipeline
         from .hidream_image import HiDreamImagePipeline
         from .hunyuan_image import HunyuanImagePipeline, HunyuanImageRefinerPipeline
         from .hunyuan_video import (
@@ -650,6 +672,7 @@
             HunyuanVideoImageToVideoPipeline,
             HunyuanVideoPipeline,
         )
+        from .hunyuan_video1_5 import HunyuanVideo15ImageToVideoPipeline, HunyuanVideo15Pipeline
         from .hunyuandit import HunyuanDiTPipeline
         from .i2vgen_xl import I2VGenXLPipeline
         from .kandinsky import (
@@ -677,7 +700,12 @@
             Kandinsky3Img2ImgPipeline,
             Kandinsky3Pipeline,
         )
-        from .kandinsky5 import Kandinsky5T2VPipeline
+        from .kandinsky5 import (
+            Kandinsky5I2IPipeline,
+            Kandinsky5I2VPipeline,
+            Kandinsky5T2IPipeline,
+            Kandinsky5T2VPipeline,
+        )
         from .latent_consistency_models import (
             LatentConsistencyModelImg2ImgPipeline,
             LatentConsistencyModelPipeline,
@@ -702,6 +730,7 @@
         from .mochi import MochiPipeline
         from .musicldm import MusicLDMPipeline
         from .omnigen import OmniGenPipeline
+        from .ovis_image import OvisImagePipeline
         from .pag import (
             AnimateDiffPAGPipeline,
             HunyuanDiTPAGPipeline,
@@ -735,7 +764,13 @@
             QwenImageInpaintPipeline,
             QwenImagePipeline,
         )
-        from .sana import SanaControlNetPipeline, SanaPipeline, SanaSprintImg2ImgPipeline, SanaSprintPipeline
+        from .sana import (
+            SanaControlNetPipeline,
+            SanaPipeline,
+            SanaSprintImg2ImgPipeline,
+            SanaSprintPipeline,
+        )
+        from .sana_video import SanaImageToVideoPipeline, SanaVideoPipeline
         from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
         from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
         from .stable_audio import StableAudioPipeline, StableAudioProjectionModel
@@ -794,12 +829,19 @@
             UniDiffuserTextDecoder,
         )
         from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline
-        from .wan import WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
+        from .wan import (
+            WanAnimatePipeline,
+            WanImageToVideoPipeline,
+            WanPipeline,
+            WanVACEPipeline,
+            WanVideoToVideoPipeline,
+        )
         from .wuerstchen import (
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
         )
+        from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
 
         try:
             if not is_onnx_available():
diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index b6b40cd6e616..878f6f08db42 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -21,10 +21,10 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import UNet2DConditionLoadersMixin
 from ...models.activations import get_activation
+from ...models.attention import AttentionMixin
 from ...models.attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -163,7 +163,7 @@ def forward(
         )
 
 
-class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class AudioLDM2UNet2DConditionModel(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     r"""
     A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
     shaped output. Compared to the vanilla [`UNet2DConditionModel`], this variant optionally includes an additional
@@ -245,16 +245,21 @@ def __init__(
         out_channels: int = 4,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -530,66 +535,6 @@ def __init__(
             block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
         )
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py
index 6251ca443533..bb9884e41381 100644
--- a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py
+++ b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py
@@ -21,10 +21,10 @@
 from ...image_processor import VaeImageProcessor
 from ...loaders import AuraFlowLoraLoaderMixin
 from ...models import AuraFlowTransformer2DModel, AutoencoderKL
-from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,
@@ -406,22 +406,12 @@ def prepare_latents(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index 8a32d4c367a3..db0268a2a73d 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -117,7 +117,9 @@
     StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
 )
+from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
 from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
+from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
 
 
 AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
@@ -161,6 +163,7 @@
         ("cogview4-control", CogView4ControlPipeline),
         ("qwenimage", QwenImagePipeline),
         ("qwenimage-controlnet", QwenImageControlNetPipeline),
+        ("z-image", ZImagePipeline),
     ]
 )
 
@@ -188,6 +191,7 @@
         ("qwenimage", QwenImageImg2ImgPipeline),
         ("qwenimage-edit", QwenImageEditPipeline),
         ("qwenimage-edit-plus", QwenImageEditPlusPipeline),
+        ("z-image", ZImageImg2ImgPipeline),
     ]
 )
 
@@ -214,6 +218,24 @@
     ]
 )
 
+AUTO_TEXT2VIDEO_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("wan", WanPipeline),
+    ]
+)
+
+AUTO_IMAGE2VIDEO_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("wan", WanImageToVideoPipeline),
+    ]
+)
+
+AUTO_VIDEO2VIDEO_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("wan", WanVideoToVideoPipeline),
+    ]
+)
+
 _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
     [
         ("kandinsky", KandinskyPipeline),
@@ -247,6 +269,9 @@
     AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
     AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
     AUTO_INPAINT_PIPELINES_MAPPING,
+    AUTO_TEXT2VIDEO_PIPELINES_MAPPING,
+    AUTO_IMAGE2VIDEO_PIPELINES_MAPPING,
+    AUTO_VIDEO2VIDEO_PIPELINES_MAPPING,
     _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING,
     _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING,
     _AUTO_INPAINT_DECODER_PIPELINES_MAPPING,
diff --git a/src/diffusers/pipelines/bria/pipeline_bria.py b/src/diffusers/pipelines/bria/pipeline_bria.py
index ebddfb0c0eee..a22a756005ac 100644
--- a/src/diffusers/pipelines/bria/pipeline_bria.py
+++ b/src/diffusers/pipelines/bria/pipeline_bria.py
@@ -245,7 +245,7 @@ def guidance_scale(self):
         return self._guidance_scale
 
     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
     # corresponds to doing no classifier free guidance.
     @property
     def do_classifier_free_guidance(self):
@@ -489,11 +489,11 @@ def __call__(
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 5.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
diff --git a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py
index 85d29029e667..8fd29756b290 100644
--- a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py
+++ b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py
@@ -73,7 +73,7 @@
 """
 
 
-class BriaFiboPipeline(DiffusionPipeline):
+class BriaFiboPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
     r"""
     Args:
         transformer (`BriaFiboTransformer2DModel`):
@@ -337,7 +337,7 @@ def guidance_scale(self):
         return self._guidance_scale
 
     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
     # corresponds to doing no classifier free guidance.
 
     @property
@@ -498,11 +498,11 @@ def __call__(
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 5.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
diff --git a/src/diffusers/pipelines/chronoedit/__init__.py b/src/diffusers/pipelines/chronoedit/__init__.py
new file mode 100644
index 000000000000..cffe4660977f
--- /dev/null
+++ b/src/diffusers/pipelines/chronoedit/__init__.py
@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_chronoedit"] = ["ChronoEditPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_chronoedit import ChronoEditPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/chronoedit/pipeline_chronoedit.py b/src/diffusers/pipelines/chronoedit/pipeline_chronoedit.py
new file mode 100644
index 000000000000..79f6580fbed6
--- /dev/null
+++ b/src/diffusers/pipelines/chronoedit/pipeline_chronoedit.py
@@ -0,0 +1,752 @@
+# Copyright 2025 The ChronoEdit Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL
+import regex as re
+import torch
+from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput
+from ...loaders import WanLoraLoaderMixin
+from ...models import AutoencoderKLWan, ChronoEditTransformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import ChronoEditPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> import numpy as np
+        >>> from diffusers import AutoencoderKLWan, ChronoEditTransformer3DModel, ChronoEditPipeline
+        >>> from diffusers.utils import export_to_video, load_image
+        >>> from transformers import CLIPVisionModel
+
+        >>> # Available models: nvidia/ChronoEdit-14B-Diffusers
+        >>> model_id = "nvidia/ChronoEdit-14B-Diffusers"
+        >>> image_encoder = CLIPVisionModel.from_pretrained(
+        ...     model_id, subfolder="image_encoder", torch_dtype=torch.float32
+        ... )
+        >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+        >>> transformer = ChronoEditTransformer3DModel.from_pretrained(
+        ...     model_id, subfolder="transformer", torch_dtype=torch.bfloat16
+        ... )
+        >>> pipe = ChronoEditPipeline.from_pretrained(
+        ...     model_id, vae=vae, image_encoder=image_encoder, transformer=transformer, torch_dtype=torch.bfloat16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> image = load_image("https://huggingface.co/spaces/nvidia/ChronoEdit/resolve/main/examples/3.png")
+        >>> max_area = 720 * 1280
+        >>> aspect_ratio = image.height / image.width
+        >>> mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+        >>> height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+        >>> width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+        >>> image = image.resize((width, height))
+        >>> prompt = (
+        ...     "The user wants to transform the image by adding a small, cute mouse sitting inside the floral teacup, enjoying a spa bath. The mouse should appear relaxed and cheerful, with a tiny white bath towel draped over its head like a turban. It should be positioned comfortably in the cup’s liquid, with gentle steam rising around it to blend with the cozy atmosphere. "
+        ...     "The mouse’s pose should be natural—perhaps sitting upright with paws resting lightly on the rim or submerged in the tea. The teacup’s floral design, gold trim, and warm lighting must remain unchanged to preserve the original aesthetic. The steam should softly swirl around the mouse, enhancing the spa-like, whimsical mood."
+        ... )
+
+        >>> output = pipe(
+        ...     image=image,
+        ...     prompt=prompt,
+        ...     height=height,
+        ...     width=width,
+        ...     num_frames=5,
+        ...     guidance_scale=5.0,
+        ...     enable_temporal_reasoning=False,
+        ...     num_temporal_reasoning_steps=0,
+        ... ).frames[0]
+        >>> export_to_video(output, "output.mp4", fps=16)
+        ```
+"""
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+def prompt_clean(text):
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class ChronoEditPipeline(DiffusionPipeline, WanLoraLoaderMixin):
+    r"""
+    Pipeline for image-to-video generation using Wan.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        tokenizer ([`T5Tokenizer`]):
+            Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
+            specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
+        text_encoder ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
+        image_encoder ([`CLIPVisionModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModel), specifically
+            the
+            [clip-vit-huge-patch14](https://github.com/mlfoundations/open_clip/blob/main/docs/PRETRAINED.md#vit-h14-xlm-roberta-large)
+            variant.
+        transformer ([`WanTransformer3DModel`]):
+            Conditional Transformer to denoise the input latents.
+        scheduler ([`UniPCMultistepScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKLWan`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        text_encoder: UMT5EncoderModel,
+        image_encoder: CLIPVisionModel,
+        image_processor: CLIPImageProcessor,
+        transformer: ChronoEditTransformer3DModel,
+        vae: AutoencoderKLWan,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            image_encoder=image_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+            image_processor=image_processor,
+        )
+
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        self.image_processor = image_processor
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt = [prompt_clean(u) for u in prompt]
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
+        seq_lens = mask.gt(0).sum(dim=1).long()
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
+        prompt_embeds = torch.stack(
+            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
+        )
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline.encode_image
+    def encode_image(
+        self,
+        image: PipelineImageInput,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+        image = self.image_processor(images=image, return_tensors="pt").to(device)
+        image_embeds = self.image_encoder(**image, output_hidden_states=True)
+        return image_embeds.hidden_states[-2]
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    # modified from diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline.check_inputs
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        image,
+        height,
+        width,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if image is not None and image_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `image`: {image} and `image_embeds`: {image_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        if image is None and image_embeds is None:
+            raise ValueError(
+                "Provide either `image` or `prompt_embeds`. Cannot leave both `image` and `image_embeds` undefined."
+            )
+        if image is not None and not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image):
+            raise ValueError(f"`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is {type(image)}")
+        if height % 16 != 0 or width % 16 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+    # modified from diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        image: PipelineImageInput,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        latent_height = height // self.vae_scale_factor_spatial
+        latent_width = width // self.vae_scale_factor_spatial
+
+        shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        image = image.unsqueeze(2)  # [batch_size, channels, 1, height, width]
+        video_condition = torch.cat(
+            [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
+        )
+        video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
+
+        latents_mean = (
+            torch.tensor(self.vae.config.latents_mean)
+            .view(1, self.vae.config.z_dim, 1, 1, 1)
+            .to(latents.device, latents.dtype)
+        )
+        latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+            latents.device, latents.dtype
+        )
+
+        if isinstance(generator, list):
+            latent_condition = [
+                retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax") for _ in generator
+            ]
+            latent_condition = torch.cat(latent_condition)
+        else:
+            latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
+            latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)
+
+        latent_condition = latent_condition.to(dtype)
+        latent_condition = (latent_condition - latents_mean) * latents_std
+
+        mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
+        mask_lat_size[:, :, list(range(1, num_frames))] = 0
+        first_frame_mask = mask_lat_size[:, :, 0:1]
+        first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
+        mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
+        mask_lat_size = mask_lat_size.view(batch_size, -1, self.vae_scale_factor_temporal, latent_height, latent_width)
+        mask_lat_size = mask_lat_size.transpose(1, 2)
+        mask_lat_size = mask_lat_size.to(latent_condition.device)
+
+        return latents, torch.concat([mask_lat_size, latent_condition], dim=1)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+        enable_temporal_reasoning: bool = False,
+        num_temporal_reasoning_steps: int = 0,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            height (`int`, defaults to `480`):
+                The height of the generated video.
+            width (`int`, defaults to `832`):
+                The width of the generated video.
+            num_frames (`int`, defaults to `81`):
+                The number of frames in the generated video.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to `5.0`):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `negative_prompt` input argument.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
+                image embeddings are generated from the `image` input argument.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ChronoEditPipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+            enable_temporal_reasoning (`bool`, *optional*, defaults to `False`):
+                Whether to enable temporal reasoning.
+            num_temporal_reasoning_steps (`int`, *optional*, defaults to `0`):
+                The number of steps to enable temporal reasoning.
+
+        Examples:
+
+        Returns:
+            [`~ChronoEditPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`ChronoEditPipelineOutput`] is returned, otherwise a `tuple` is returned
+                where the first element is a list with the generated images and the second element is a list of `bool`s
+                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            image,
+            height,
+            width,
+            prompt_embeds,
+            negative_prompt_embeds,
+            image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        num_frames = 5 if not enable_temporal_reasoning else num_frames
+
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+
+        # Encode image embedding
+        transformer_dtype = self.transformer.dtype
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+        if image_embeds is None:
+            image_embeds = self.encode_image(image, device)
+        image_embeds = image_embeds.repeat(batch_size, 1, 1)
+        image_embeds = image_embeds.to(transformer_dtype)
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.vae.config.z_dim
+        image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+        latents, condition = self.prepare_latents(
+            image,
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            num_frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                if enable_temporal_reasoning and i == num_temporal_reasoning_steps:
+                    latents = latents[:, :, [0, -1]]
+                    condition = condition[:, :, [0, -1]]
+
+                    for j in range(len(self.scheduler.model_outputs)):
+                        if self.scheduler.model_outputs[j] is not None:
+                            if latents.shape[-3] != self.scheduler.model_outputs[j].shape[-3]:
+                                self.scheduler.model_outputs[j] = self.scheduler.model_outputs[j][:, :, [0, -1]]
+                    if self.scheduler.last_sample is not None:
+                        self.scheduler.last_sample = self.scheduler.last_sample[:, :, [0, -1]]
+
+                self._current_timestep = t
+                latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
+                timestep = t.expand(latents.shape[0])
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_hidden_states_image=image_embeds,
+                    attention_kwargs=attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if self.do_classifier_free_guidance:
+                    noise_uncond = self.transformer(
+                        hidden_states=latent_model_input,
+                        timestep=timestep,
+                        encoder_hidden_states=negative_prompt_embeds,
+                        encoder_hidden_states_image=image_embeds,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                    )[0]
+                    noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        if not output_type == "latent":
+            latents = latents.to(self.vae.dtype)
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            if enable_temporal_reasoning and latents.shape[2] > 2:
+                video_edit = self.vae.decode(latents[:, :, [0, -1]], return_dict=False)[0]
+                video_reason = self.vae.decode(latents[:, :, :-1], return_dict=False)[0]
+                video = torch.cat([video_reason, video_edit[:, :, 1:]], dim=2)
+            else:
+                video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return ChronoEditPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/chronoedit/pipeline_output.py b/src/diffusers/pipelines/chronoedit/pipeline_output.py
new file mode 100644
index 000000000000..b1df5b9de35d
--- /dev/null
+++ b/src/diffusers/pipelines/chronoedit/pipeline_output.py
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+
+import torch
+
+from diffusers.utils import BaseOutput
+
+
+@dataclass
+class ChronoEditPipelineOutput(BaseOutput):
+    r"""
+    Output class for ChronoEdit pipelines.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+
+    frames: torch.Tensor
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
index 397ab15715c2..fb09d04832f3 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -36,10 +36,6 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, MultiControlNetModel, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -1132,21 +1128,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
index 8b37d38b9099..0e2a1441f8f6 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
@@ -39,10 +39,6 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, MultiControlNetModel, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -929,21 +925,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index 4d4845c5a0a3..94c4c394465b 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -39,10 +39,6 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, MultiControlNetModel, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -1044,21 +1040,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py
index fb58b222112a..e234015f8616 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py
@@ -41,10 +41,6 @@
     MultiControlNetUnionModel,
     UNet2DConditionModel,
 )
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -1111,21 +1107,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py
index 5961d389effb..40cc76cf70d8 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py
@@ -44,14 +44,11 @@
     MultiControlNetUnionModel,
     UNet2DConditionModel,
 )
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -904,21 +901,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py
index 8fedb6d8609a..4d0093132b9c 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py
@@ -44,10 +44,6 @@
     MultiControlNetUnionModel,
     UNet2DConditionModel,
 )
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -1032,21 +1028,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
index f67a0e211281..d605eac1f2b1 100644
--- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
+++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py
@@ -355,7 +355,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
index 68984da4dc40..9d0158c6b654 100644
--- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
+++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
@@ -373,7 +373,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
index c10931a0f44a..7bf610f3a0ba 100644
--- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
@@ -32,14 +32,11 @@
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -685,21 +682,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
index 397fbc0d85b8..7c25713cd1d7 100644
--- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
@@ -374,21 +374,21 @@ def __init__(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlockFlat",
             "CrossAttnDownBlockFlat",
             "CrossAttnDownBlockFlat",
             "DownBlockFlat",
         ),
         mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn",
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "UpBlockFlat",
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
         ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/pipelines/flux2/__init__.py b/src/diffusers/pipelines/flux2/__init__.py
new file mode 100644
index 000000000000..d986c9a63011
--- /dev/null
+++ b/src/diffusers/pipelines/flux2/__init__.py
@@ -0,0 +1,47 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_additional_imports = {}
+_import_structure = {"pipeline_output": ["Flux2PipelineOutput"]}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_flux2"] = ["Flux2Pipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipeline_flux2 import Flux2Pipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
+    for name, value in _additional_imports.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/flux2/image_processor.py b/src/diffusers/pipelines/flux2/image_processor.py
new file mode 100644
index 000000000000..f1a8742491f7
--- /dev/null
+++ b/src/diffusers/pipelines/flux2/image_processor.py
@@ -0,0 +1,178 @@
+# Copyright 2025 The Black Forest Labs Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import List
+
+import PIL.Image
+
+from ...configuration_utils import register_to_config
+from ...image_processor import VaeImageProcessor
+
+
+class Flux2ImageProcessor(VaeImageProcessor):
+    r"""
+    Image processor to preprocess the reference (character) image for the Flux2 model.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
+            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
+        vae_scale_factor (`int`, *optional*, defaults to `16`):
+            VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
+            this factor.
+        vae_latent_channels (`int`, *optional*, defaults to `32`):
+            VAE latent channels.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image to [-1,1].
+        do_convert_rgb (`bool`, *optional*, defaults to be `True`):
+            Whether to convert the images to RGB format.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 16,
+        vae_latent_channels: int = 32,
+        do_normalize: bool = True,
+        do_convert_rgb: bool = True,
+    ):
+        super().__init__(
+            do_resize=do_resize,
+            vae_scale_factor=vae_scale_factor,
+            vae_latent_channels=vae_latent_channels,
+            do_normalize=do_normalize,
+            do_convert_rgb=do_convert_rgb,
+        )
+
+    @staticmethod
+    def check_image_input(
+        image: PIL.Image.Image, max_aspect_ratio: int = 8, min_side_length: int = 64, max_area: int = 1024 * 1024
+    ) -> PIL.Image.Image:
+        """
+        Check if image meets minimum size and aspect ratio requirements.
+
+        Args:
+            image: PIL Image to validate
+            max_aspect_ratio: Maximum allowed aspect ratio (width/height or height/width)
+            min_side_length: Minimum pixels required for width and height
+            max_area: Maximum allowed area in pixels²
+
+        Returns:
+            The input image if valid
+
+        Raises:
+            ValueError: If image is too small or aspect ratio is too extreme
+        """
+        if not isinstance(image, PIL.Image.Image):
+            raise ValueError(f"Image must be a PIL.Image.Image, got {type(image)}")
+
+        width, height = image.size
+
+        # Check minimum dimensions
+        if width < min_side_length or height < min_side_length:
+            raise ValueError(
+                f"Image too small: {width}×{height}. Both dimensions must be at least {min_side_length}px"
+            )
+
+        # Check aspect ratio
+        aspect_ratio = max(width / height, height / width)
+        if aspect_ratio > max_aspect_ratio:
+            raise ValueError(
+                f"Aspect ratio too extreme: {width}×{height} (ratio: {aspect_ratio:.1f}:1). "
+                f"Maximum allowed ratio is {max_aspect_ratio}:1"
+            )
+
+        return image
+
+    @staticmethod
+    def _resize_to_target_area(image: PIL.Image.Image, target_area: int = 1024 * 1024) -> PIL.Image.Image:
+        image_width, image_height = image.size
+
+        scale = math.sqrt(target_area / (image_width * image_height))
+        width = int(image_width * scale)
+        height = int(image_height * scale)
+
+        return image.resize((width, height), PIL.Image.Resampling.LANCZOS)
+
+    @staticmethod
+    def _resize_if_exceeds_area(image, target_area=1024 * 1024) -> PIL.Image.Image:
+        image_width, image_height = image.size
+        pixel_count = image_width * image_height
+        if pixel_count <= target_area:
+            return image
+        return Flux2ImageProcessor._resize_to_target_area(image, target_area)
+
+    def _resize_and_crop(
+        self,
+        image: PIL.Image.Image,
+        width: int,
+        height: int,
+    ) -> PIL.Image.Image:
+        r"""
+        center crop the image to the specified width and height.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The image to resize and crop.
+            width (`int`):
+                The width to resize the image to.
+            height (`int`):
+                The height to resize the image to.
+
+        Returns:
+            `PIL.Image.Image`:
+                The resized and cropped image.
+        """
+        image_width, image_height = image.size
+
+        left = (image_width - width) // 2
+        top = (image_height - height) // 2
+        right = left + width
+        bottom = top + height
+
+        return image.crop((left, top, right, bottom))
+
+    # Taken from
+    # https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/sampling.py#L310C1-L339C19
+    @staticmethod
+    def concatenate_images(images: List[PIL.Image.Image]) -> PIL.Image.Image:
+        """
+        Concatenate a list of PIL images horizontally with center alignment and white background.
+        """
+
+        # If only one image, return a copy of it
+        if len(images) == 1:
+            return images[0].copy()
+
+        # Convert all images to RGB if not already
+        images = [img.convert("RGB") if img.mode != "RGB" else img for img in images]
+
+        # Calculate dimensions for horizontal concatenation
+        total_width = sum(img.width for img in images)
+        max_height = max(img.height for img in images)
+
+        # Create new image with white background
+        background_color = (255, 255, 255)
+        new_img = PIL.Image.new("RGB", (total_width, max_height), background_color)
+
+        # Paste images with center alignment
+        x_offset = 0
+        for img in images:
+            y_offset = (max_height - img.height) // 2
+            new_img.paste(img, (x_offset, y_offset))
+            x_offset += img.width
+
+        return new_img
diff --git a/src/diffusers/pipelines/flux2/pipeline_flux2.py b/src/diffusers/pipelines/flux2/pipeline_flux2.py
new file mode 100644
index 000000000000..b54a43dd89a5
--- /dev/null
+++ b/src/diffusers/pipelines/flux2/pipeline_flux2.py
@@ -0,0 +1,1032 @@
+# Copyright 2025 Black Forest Labs and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import AutoProcessor, Mistral3ForConditionalGeneration
+
+from ...loaders import Flux2LoraLoaderMixin
+from ...models import AutoencoderKLFlux2, Flux2Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .image_processor import Flux2ImageProcessor
+from .pipeline_output import Flux2PipelineOutput
+from .system_messages import SYSTEM_MESSAGE, SYSTEM_MESSAGE_UPSAMPLING_I2I, SYSTEM_MESSAGE_UPSAMPLING_T2I
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import Flux2Pipeline
+
+        >>> pipe = Flux2Pipeline.from_pretrained("black-forest-labs/FLUX.2-dev", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> prompt = "A cat holding a sign that says hello world"
+        >>> # Depending on the variant being used, the pipeline call will slightly vary.
+        >>> # Refer to the pipeline documentation for more details.
+        >>> image = pipe(prompt, num_inference_steps=50, guidance_scale=2.5).images[0]
+        >>> image.save("flux.png")
+        ```
+"""
+
+UPSAMPLING_MAX_IMAGE_SIZE = 768**2
+
+
+# Adapted from
+# https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/text_encoder.py#L68
+def format_input(
+    prompts: List[str],
+    system_message: str = SYSTEM_MESSAGE,
+    images: Optional[Union[List[PIL.Image.Image], List[List[PIL.Image.Image]]]] = None,
+):
+    """
+    Format a batch of text prompts into the conversation format expected by apply_chat_template. Optionally, add images
+    to the input.
+
+    Args:
+        prompts: List of text prompts
+        system_message: System message to use (default: CREATIVE_SYSTEM_MESSAGE)
+        images (optional): List of images to add to the input.
+
+    Returns:
+        List of conversations, where each conversation is a list of message dicts
+    """
+    # Remove [IMG] tokens from prompts to avoid Pixtral validation issues
+    # when truncation is enabled. The processor counts [IMG] tokens and fails
+    # if the count changes after truncation.
+    cleaned_txt = [prompt.replace("[IMG]", "") for prompt in prompts]
+
+    if images is None or len(images) == 0:
+        return [
+            [
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": system_message}],
+                },
+                {"role": "user", "content": [{"type": "text", "text": prompt}]},
+            ]
+            for prompt in cleaned_txt
+        ]
+    else:
+        assert len(images) == len(prompts), "Number of images must match number of prompts"
+        messages = [
+            [
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": system_message}],
+                },
+            ]
+            for _ in cleaned_txt
+        ]
+
+        for i, (el, images) in enumerate(zip(messages, images)):
+            # optionally add the images per batch element.
+            if images is not None:
+                el.append(
+                    {
+                        "role": "user",
+                        "content": [{"type": "image", "image": image_obj} for image_obj in images],
+                    }
+                )
+            # add the text.
+            el.append(
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": cleaned_txt[i]}],
+                }
+            )
+
+        return messages
+
+
+# Adapted from
+# https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/text_encoder.py#L49C5-L66C19
+def _validate_and_process_images(
+    images: List[List[PIL.Image.Image]] | List[PIL.Image.Image],
+    image_processor: Flux2ImageProcessor,
+    upsampling_max_image_size: int,
+) -> List[List[PIL.Image.Image]]:
+    # Simple validation: ensure it's a list of PIL images or list of lists of PIL images
+    if not images:
+        return []
+
+    # Check if it's a list of lists or a list of images
+    if isinstance(images[0], PIL.Image.Image):
+        # It's a list of images, convert to list of lists
+        images = [[im] for im in images]
+
+    # potentially concatenate multiple images to reduce the size
+    images = [[image_processor.concatenate_images(img_i)] if len(img_i) > 1 else img_i for img_i in images]
+
+    # cap the pixels
+    images = [
+        [image_processor._resize_if_exceeds_area(img_i, upsampling_max_image_size) for img_i in img_i]
+        for img_i in images
+    ]
+    return images
+
+
+# Taken from
+# https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/sampling.py#L251
+def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float:
+    a1, b1 = 8.73809524e-05, 1.89833333
+    a2, b2 = 0.00016927, 0.45666666
+
+    if image_seq_len > 4300:
+        mu = a2 * image_seq_len + b2
+        return float(mu)
+
+    m_200 = a2 * image_seq_len + b2
+    m_10 = a1 * image_seq_len + b1
+
+    a = (m_200 - m_10) / 190.0
+    b = m_200 - 200.0 * a
+    mu = a * num_steps + b
+
+    return float(mu)
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class Flux2Pipeline(DiffusionPipeline, Flux2LoraLoaderMixin):
+    r"""
+    The Flux2 pipeline for text-to-image generation.
+
+    Reference: [https://bfl.ai/blog/flux-2](https://bfl.ai/blog/flux-2)
+
+    Args:
+        transformer ([`Flux2Transformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKLFlux2`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`Mistral3ForConditionalGeneration`]):
+            [Mistral3ForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/mistral3#transformers.Mistral3ForConditionalGeneration)
+        tokenizer (`AutoProcessor`):
+            Tokenizer of class
+            [PixtralProcessor](https://huggingface.co/docs/transformers/en/model_doc/pixtral#transformers.PixtralProcessor).
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKLFlux2,
+        text_encoder: Mistral3ForConditionalGeneration,
+        tokenizer: AutoProcessor,
+        transformer: Flux2Transformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            transformer=transformer,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = Flux2ImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        self.tokenizer_max_length = 512
+        self.default_sample_size = 128
+
+        self.system_message = SYSTEM_MESSAGE
+        self.system_message_upsampling_t2i = SYSTEM_MESSAGE_UPSAMPLING_T2I
+        self.system_message_upsampling_i2i = SYSTEM_MESSAGE_UPSAMPLING_I2I
+        self.upsampling_max_image_size = UPSAMPLING_MAX_IMAGE_SIZE
+
+    @staticmethod
+    def _get_mistral_3_small_prompt_embeds(
+        text_encoder: Mistral3ForConditionalGeneration,
+        tokenizer: AutoProcessor,
+        prompt: Union[str, List[str]],
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        max_sequence_length: int = 512,
+        system_message: str = SYSTEM_MESSAGE,
+        hidden_states_layers: List[int] = (10, 20, 30),
+    ):
+        dtype = text_encoder.dtype if dtype is None else dtype
+        device = text_encoder.device if device is None else device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        # Format input messages
+        messages_batch = format_input(prompts=prompt, system_message=system_message)
+
+        # Process all messages at once
+        inputs = tokenizer.apply_chat_template(
+            messages_batch,
+            add_generation_prompt=False,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=max_sequence_length,
+        )
+
+        # Move to device
+        input_ids = inputs["input_ids"].to(device)
+        attention_mask = inputs["attention_mask"].to(device)
+
+        # Forward pass through the model
+        output = text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            use_cache=False,
+        )
+
+        # Only use outputs from intermediate layers and stack them
+        out = torch.stack([output.hidden_states[k] for k in hidden_states_layers], dim=1)
+        out = out.to(dtype=dtype, device=device)
+
+        batch_size, num_channels, seq_len, hidden_dim = out.shape
+        prompt_embeds = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_channels * hidden_dim)
+
+        return prompt_embeds
+
+    @staticmethod
+    def _prepare_text_ids(
+        x: torch.Tensor,  # (B, L, D) or (L, D)
+        t_coord: Optional[torch.Tensor] = None,
+    ):
+        B, L, _ = x.shape
+        out_ids = []
+
+        for i in range(B):
+            t = torch.arange(1) if t_coord is None else t_coord[i]
+            h = torch.arange(1)
+            w = torch.arange(1)
+            l = torch.arange(L)
+
+            coords = torch.cartesian_prod(t, h, w, l)
+            out_ids.append(coords)
+
+        return torch.stack(out_ids)
+
+    @staticmethod
+    def _prepare_latent_ids(
+        latents: torch.Tensor,  # (B, C, H, W)
+    ):
+        r"""
+        Generates 4D position coordinates (T, H, W, L) for latent tensors.
+
+        Args:
+            latents (torch.Tensor):
+                Latent tensor of shape (B, C, H, W)
+
+        Returns:
+            torch.Tensor:
+                Position IDs tensor of shape (B, H*W, 4) All batches share the same coordinate structure: T=0,
+                H=[0..H-1], W=[0..W-1], L=0
+        """
+
+        batch_size, _, height, width = latents.shape
+
+        t = torch.arange(1)  # [0] - time dimension
+        h = torch.arange(height)
+        w = torch.arange(width)
+        l = torch.arange(1)  # [0] - layer dimension
+
+        # Create position IDs: (H*W, 4)
+        latent_ids = torch.cartesian_prod(t, h, w, l)
+
+        # Expand to batch: (B, H*W, 4)
+        latent_ids = latent_ids.unsqueeze(0).expand(batch_size, -1, -1)
+
+        return latent_ids
+
+    @staticmethod
+    def _prepare_image_ids(
+        image_latents: List[torch.Tensor],  # [(1, C, H, W), (1, C, H, W), ...]
+        scale: int = 10,
+    ):
+        r"""
+        Generates 4D time-space coordinates (T, H, W, L) for a sequence of image latents.
+
+        This function creates a unique coordinate for every pixel/patch across all input latent with different
+        dimensions.
+
+        Args:
+            image_latents (List[torch.Tensor]):
+                A list of image latent feature tensors, typically of shape (C, H, W).
+            scale (int, optional):
+                A factor used to define the time separation (T-coordinate) between latents. T-coordinate for the i-th
+                latent is: 'scale + scale * i'. Defaults to 10.
+
+        Returns:
+            torch.Tensor:
+                The combined coordinate tensor. Shape: (1, N_total, 4) Where N_total is the sum of (H * W) for all
+                input latents.
+
+        Coordinate Components (Dimension 4):
+            - T (Time): The unique index indicating which latent image the coordinate belongs to.
+            - H (Height): The row index within that latent image.
+            - W (Width): The column index within that latent image.
+            - L (Seq. Length): A sequence length dimension, which is always fixed at 0 (size 1)
+        """
+
+        if not isinstance(image_latents, list):
+            raise ValueError(f"Expected `image_latents` to be a list, got {type(image_latents)}.")
+
+        # create time offset for each reference image
+        t_coords = [scale + scale * t for t in torch.arange(0, len(image_latents))]
+        t_coords = [t.view(-1) for t in t_coords]
+
+        image_latent_ids = []
+        for x, t in zip(image_latents, t_coords):
+            x = x.squeeze(0)
+            _, height, width = x.shape
+
+            x_ids = torch.cartesian_prod(t, torch.arange(height), torch.arange(width), torch.arange(1))
+            image_latent_ids.append(x_ids)
+
+        image_latent_ids = torch.cat(image_latent_ids, dim=0)
+        image_latent_ids = image_latent_ids.unsqueeze(0)
+
+        return image_latent_ids
+
+    @staticmethod
+    def _patchify_latents(latents):
+        batch_size, num_channels_latents, height, width = latents.shape
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 1, 3, 5, 2, 4)
+        latents = latents.reshape(batch_size, num_channels_latents * 4, height // 2, width // 2)
+        return latents
+
+    @staticmethod
+    def _unpatchify_latents(latents):
+        batch_size, num_channels_latents, height, width = latents.shape
+        latents = latents.reshape(batch_size, num_channels_latents // (2 * 2), 2, 2, height, width)
+        latents = latents.permute(0, 1, 4, 2, 5, 3)
+        latents = latents.reshape(batch_size, num_channels_latents // (2 * 2), height * 2, width * 2)
+        return latents
+
+    @staticmethod
+    def _pack_latents(latents):
+        """
+        pack latents: (batch_size, num_channels, height, width) -> (batch_size, height * width, num_channels)
+        """
+
+        batch_size, num_channels, height, width = latents.shape
+        latents = latents.reshape(batch_size, num_channels, height * width).permute(0, 2, 1)
+
+        return latents
+
+    @staticmethod
+    def _unpack_latents_with_ids(x: torch.Tensor, x_ids: torch.Tensor) -> list[torch.Tensor]:
+        """
+        using position ids to scatter tokens into place
+        """
+        x_list = []
+        for data, pos in zip(x, x_ids):
+            _, ch = data.shape  # noqa: F841
+            h_ids = pos[:, 1].to(torch.int64)
+            w_ids = pos[:, 2].to(torch.int64)
+
+            h = torch.max(h_ids) + 1
+            w = torch.max(w_ids) + 1
+
+            flat_ids = h_ids * w + w_ids
+
+            out = torch.zeros((h * w, ch), device=data.device, dtype=data.dtype)
+            out.scatter_(0, flat_ids.unsqueeze(1).expand(-1, ch), data)
+
+            # reshape from (H * W, C) to (H, W, C) and permute to (C, H, W)
+
+            out = out.view(h, w, ch).permute(2, 0, 1)
+            x_list.append(out)
+
+        return torch.stack(x_list, dim=0)
+
+    def upsample_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        images: Union[List[PIL.Image.Image], List[List[PIL.Image.Image]]] = None,
+        temperature: float = 0.15,
+        device: torch.device = None,
+    ) -> List[str]:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        device = self.text_encoder.device if device is None else device
+
+        # Set system message based on whether images are provided
+        if images is None or len(images) == 0 or images[0] is None:
+            system_message = SYSTEM_MESSAGE_UPSAMPLING_T2I
+        else:
+            system_message = SYSTEM_MESSAGE_UPSAMPLING_I2I
+
+        # Validate and process the input images
+        if images:
+            images = _validate_and_process_images(images, self.image_processor, self.upsampling_max_image_size)
+
+        # Format input messages
+        messages_batch = format_input(prompts=prompt, system_message=system_message, images=images)
+
+        # Process all messages at once
+        # with image processing a too short max length can throw an error in here.
+        inputs = self.tokenizer.apply_chat_template(
+            messages_batch,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=2048,
+        )
+
+        # Move to device
+        inputs["input_ids"] = inputs["input_ids"].to(device)
+        inputs["attention_mask"] = inputs["attention_mask"].to(device)
+
+        if "pixel_values" in inputs:
+            inputs["pixel_values"] = inputs["pixel_values"].to(device, self.text_encoder.dtype)
+
+        # Generate text using the model's generate method
+        generated_ids = self.text_encoder.generate(
+            **inputs,
+            max_new_tokens=512,
+            do_sample=True,
+            temperature=temperature,
+            use_cache=True,
+        )
+
+        # Decode only the newly generated tokens (skip input tokens)
+        # Extract only the generated portion
+        input_length = inputs["input_ids"].shape[1]
+        generated_tokens = generated_ids[:, input_length:]
+
+        upsampled_prompt = self.tokenizer.tokenizer.batch_decode(
+            generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )
+        return upsampled_prompt
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 512,
+        text_encoder_out_layers: Tuple[int] = (10, 20, 30),
+    ):
+        device = device or self._execution_device
+
+        if prompt is None:
+            prompt = ""
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_mistral_3_small_prompt_embeds(
+                text_encoder=self.text_encoder,
+                tokenizer=self.tokenizer,
+                prompt=prompt,
+                device=device,
+                max_sequence_length=max_sequence_length,
+                system_message=self.system_message,
+                hidden_states_layers=text_encoder_out_layers,
+            )
+
+        batch_size, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        text_ids = self._prepare_text_ids(prompt_embeds)
+        text_ids = text_ids.to(device)
+        return prompt_embeds, text_ids
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if image.ndim != 4:
+            raise ValueError(f"Expected image dims 4, got {image.ndim}.")
+
+        image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")
+        image_latents = self._patchify_latents(image_latents)
+
+        latents_bn_mean = self.vae.bn.running_mean.view(1, -1, 1, 1).to(image_latents.device, image_latents.dtype)
+        latents_bn_std = torch.sqrt(self.vae.bn.running_var.view(1, -1, 1, 1) + self.vae.config.batch_norm_eps)
+        image_latents = (image_latents - latents_bn_mean) / latents_bn_std
+
+        return image_latents
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_latents_channels,
+        height,
+        width,
+        dtype,
+        device,
+        generator: torch.Generator,
+        latents: Optional[torch.Tensor] = None,
+    ):
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+
+        shape = (batch_size, num_latents_channels * 4, height // 2, width // 2)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        latent_ids = self._prepare_latent_ids(latents)
+        latent_ids = latent_ids.to(device)
+
+        latents = self._pack_latents(latents)  # [B, C, H, W] -> [B, H*W, C]
+        return latents, latent_ids
+
+    def prepare_image_latents(
+        self,
+        images: List[torch.Tensor],
+        batch_size,
+        generator: torch.Generator,
+        device,
+        dtype,
+    ):
+        image_latents = []
+        for image in images:
+            image = image.to(device=device, dtype=dtype)
+            imagge_latent = self._encode_vae_image(image=image, generator=generator)
+            image_latents.append(imagge_latent)  # (1, 128, 32, 32)
+
+        image_latent_ids = self._prepare_image_ids(image_latents)
+
+        # Pack each latent and concatenate
+        packed_latents = []
+        for latent in image_latents:
+            # latent: (1, 128, 32, 32)
+            packed = self._pack_latents(latent)  # (1, 1024, 128)
+            packed = packed.squeeze(0)  # (1024, 128) - remove batch dim
+            packed_latents.append(packed)
+
+        # Concatenate all reference tokens along sequence dimension
+        image_latents = torch.cat(packed_latents, dim=0)  # (N*1024, 128)
+        image_latents = image_latents.unsqueeze(0)  # (1, N*1024, 128)
+
+        image_latents = image_latents.repeat(batch_size, 1, 1)
+        image_latent_ids = image_latent_ids.repeat(batch_size, 1, 1)
+        image_latent_ids = image_latent_ids.to(device)
+
+        return image_latents, image_latent_ids
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if (
+            height is not None
+            and height % (self.vae_scale_factor * 2) != 0
+            or width is not None
+            and width % (self.vae_scale_factor * 2) != 0
+        ):
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Optional[Union[List[PIL.Image.Image], PIL.Image.Image]] = None,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: Optional[float] = 4.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+        text_encoder_out_layers: Tuple[int] = (10, 20, 30),
+        caption_upsample_temperature: float = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            guidance_scale (`float`, *optional*, defaults to 1.0):
+                Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
+                a model to generate images more aligned with `prompt` at the expense of lower image quality.
+
+                Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to
+                the [paper](https://huggingface.co/papers/2210.03142) to learn more.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+            text_encoder_out_layers (`Tuple[int]`):
+                Layer indices to use in the `text_encoder` to derive the final prompt embeddings.
+            caption_upsample_temperature (`float`):
+                When specified, we will try to perform caption upsampling for potentially improved outputs. We
+                recommend setting it to 0.15 if caption upsampling is to be performed.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.flux2.Flux2PipelineOutput`] or `tuple`: [`~pipelines.flux2.Flux2PipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            prompt_embeds=prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. prepare text embeddings
+        if caption_upsample_temperature:
+            prompt = self.upsample_prompt(
+                prompt, images=image, temperature=caption_upsample_temperature, device=device
+            )
+        prompt_embeds, text_ids = self.encode_prompt(
+            prompt=prompt,
+            prompt_embeds=prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+            max_sequence_length=max_sequence_length,
+            text_encoder_out_layers=text_encoder_out_layers,
+        )
+
+        # 4. process images
+        if image is not None and not isinstance(image, list):
+            image = [image]
+
+        condition_images = None
+        if image is not None:
+            for img in image:
+                self.image_processor.check_image_input(img)
+
+            condition_images = []
+            for img in image:
+                image_width, image_height = img.size
+                if image_width * image_height > 1024 * 1024:
+                    img = self.image_processor._resize_to_target_area(img, 1024 * 1024)
+                    image_width, image_height = img.size
+
+                multiple_of = self.vae_scale_factor * 2
+                image_width = (image_width // multiple_of) * multiple_of
+                image_height = (image_height // multiple_of) * multiple_of
+                img = self.image_processor.preprocess(img, height=image_height, width=image_width, resize_mode="crop")
+                condition_images.append(img)
+                height = height or image_height
+                width = width or image_width
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 5. prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+        latents, latent_ids = self.prepare_latents(
+            batch_size=batch_size * num_images_per_prompt,
+            num_latents_channels=num_channels_latents,
+            height=height,
+            width=width,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        image_latents = None
+        image_latent_ids = None
+        if condition_images is not None:
+            image_latents, image_latent_ids = self.prepare_image_latents(
+                images=condition_images,
+                batch_size=batch_size * num_images_per_prompt,
+                generator=generator,
+                device=device,
+                dtype=self.vae.dtype,
+            )
+
+        # 6. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        if hasattr(self.scheduler.config, "use_flow_sigmas") and self.scheduler.config.use_flow_sigmas:
+            sigmas = None
+        image_seq_len = latents.shape[1]
+        mu = compute_empirical_mu(image_seq_len=image_seq_len, num_steps=num_inference_steps)
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # handle guidance
+        guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+        guidance = guidance.expand(latents.shape[0])
+
+        # 7. Denoising loop
+        # We set the index here to remove DtoH sync, helpful especially during compilation.
+        # Check out more details here: https://github.com/huggingface/diffusers/pull/11696
+        self.scheduler.set_begin_index(0)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                self._current_timestep = t
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                latent_model_input = latents.to(self.transformer.dtype)
+                latent_image_ids = latent_ids
+
+                if image_latents is not None:
+                    latent_model_input = torch.cat([latents, image_latents], dim=1).to(self.transformer.dtype)
+                    latent_image_ids = torch.cat([latent_ids, image_latent_ids], dim=1)
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,  # (B, image_seq_len, C)
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,  # B, text_seq_len, 4
+                    img_ids=latent_image_ids,  # B, image_seq_len, 4
+                    joint_attention_kwargs=self._attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                noise_pred = noise_pred[:, : latents.size(1) :]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents_with_ids(latents, latent_ids)
+
+            latents_bn_mean = self.vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype)
+            latents_bn_std = torch.sqrt(self.vae.bn.running_var.view(1, -1, 1, 1) + self.vae.config.batch_norm_eps).to(
+                latents.device, latents.dtype
+            )
+            latents = latents * latents_bn_std + latents_bn_mean
+            latents = self._unpatchify_latents(latents)
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return Flux2PipelineOutput(images=image)
diff --git a/src/diffusers/pipelines/flux2/pipeline_output.py b/src/diffusers/pipelines/flux2/pipeline_output.py
new file mode 100644
index 000000000000..58e8ad49c210
--- /dev/null
+++ b/src/diffusers/pipelines/flux2/pipeline_output.py
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class Flux2PipelineOutput(BaseOutput):
+    """
+    Output class for Flux2 image generation pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `torch.Tensor` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
+            height, width, num_channels)`. PIL images or numpy array present the denoised images of the diffusion
+            pipeline. Torch tensors can represent either the denoised images or the intermediate latents ready to be
+            passed to the decoder.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/src/diffusers/pipelines/flux2/system_messages.py b/src/diffusers/pipelines/flux2/system_messages.py
new file mode 100644
index 000000000000..ecdb1371f0d4
--- /dev/null
+++ b/src/diffusers/pipelines/flux2/system_messages.py
@@ -0,0 +1,33 @@
+# docstyle-ignore
+"""
+These system prompts come from:
+https://github.com/black-forest-labs/flux2/blob/5a5d316b1b42f6b59a8c9194b77c8256be848432/src/flux2/system_messages.py#L54
+"""
+
+# docstyle-ignore
+SYSTEM_MESSAGE = """You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object
+attribution and actions without speculation."""
+
+# docstyle-ignore
+SYSTEM_MESSAGE_UPSAMPLING_T2I = """You are an expert prompt engineer for FLUX.2 by Black Forest Labs. Rewrite user prompts to be more descriptive while strictly preserving their core subject and intent.
+
+Guidelines:
+1. Structure: Keep structured inputs structured (enhance within fields). Convert natural language to detailed paragraphs.
+2. Details: Add concrete visual specifics - form, scale, textures, materials, lighting (quality, direction, color), shadows, spatial relationships, and environmental context.
+3. Text in Images: Put ALL text in quotation marks, matching the prompt's language. Always provide explicit quoted text for objects that would contain text in reality (signs, labels, screens, etc.) - without it, the model generates gibberish.
+
+Output only the revised prompt and nothing else."""
+
+# docstyle-ignore
+SYSTEM_MESSAGE_UPSAMPLING_I2I = """You are FLUX.2 by Black Forest Labs, an image-editing expert. You convert editing requests into one concise instruction (50-80 words, ~30 for brief requests).
+
+Rules:
+- Single instruction only, no commentary
+- Use clear, analytical language (avoid "whimsical," "cascading," etc.)
+- Specify what changes AND what stays the same (face, lighting, composition)
+- Reference actual image elements
+- Turn negatives into positives ("don't change X" → "keep X")
+- Make abstractions concrete ("futuristic" → "glowing cyan neon, metallic panels")
+- Keep content PG-13
+
+Output only the final instruction in plain text and nothing else."""
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/__init__.py b/src/diffusers/pipelines/hunyuan_video1_5/__init__.py
new file mode 100644
index 000000000000..846320f4ace0
--- /dev/null
+++ b/src/diffusers/pipelines/hunyuan_video1_5/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_hunyuan_video1_5"] = ["HunyuanVideo15Pipeline"]
+    _import_structure["pipeline_hunyuan_video1_5_image2video"] = ["HunyuanVideo15ImageToVideoPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_hunyuan_video1_5 import HunyuanVideo15Pipeline
+        from .pipeline_hunyuan_video1_5_image2video import HunyuanVideo15ImageToVideoPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py b/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py
new file mode 100644
index 000000000000..82817365b6a5
--- /dev/null
+++ b/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py
@@ -0,0 +1,103 @@
+# Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...configuration_utils import register_to_config
+from ...video_processor import VideoProcessor
+
+
+# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L20
+def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
+    num_patches = round((base_size / patch_size) ** 2)
+    assert max_ratio >= 1.0
+    crop_size_list = []
+    wp, hp = num_patches, 1
+    while wp > 0:
+        if max(wp, hp) / min(wp, hp) <= max_ratio:
+            crop_size_list.append((wp * patch_size, hp * patch_size))
+        if (hp + 1) * wp <= num_patches:
+            hp += 1
+        else:
+            wp -= 1
+    return crop_size_list
+
+
+# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L38
+def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
+    """
+    Get the closest ratio in the buckets.
+
+    Args:
+        height (float): video height
+        width (float): video width
+        ratios (list): video aspect ratio
+        buckets (list): buckets generated by `generate_crop_size_list`
+
+    Returns:
+        the closest size in the buckets and the corresponding ratio
+    """
+    aspect_ratio = float(height) / float(width)
+    diff_ratios = ratios - aspect_ratio
+
+    if aspect_ratio >= 1:
+        indices = [(index, x) for index, x in enumerate(diff_ratios) if x <= 0]
+    else:
+        indices = [(index, x) for index, x in enumerate(diff_ratios) if x >= 0]
+
+    closest_ratio_id = min(indices, key=lambda pair: abs(pair[1]))[0]
+    closest_size = buckets[closest_ratio_id]
+    closest_ratio = ratios[closest_ratio_id]
+
+    return closest_size, closest_ratio
+
+
+class HunyuanVideo15ImageProcessor(VideoProcessor):
+    r"""
+    Image/video processor to preproces/postprocess the reference image/generatedvideo for the HunyuanVideo1.5 model.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
+            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
+        vae_scale_factor (`int`, *optional*, defaults to `16`):
+            VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
+            this factor.
+        vae_latent_channels (`int`, *optional*, defaults to `32`):
+            VAE latent channels.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 16,
+        vae_latent_channels: int = 32,
+        do_convert_rgb: bool = True,
+    ):
+        super().__init__(
+            do_resize=do_resize,
+            vae_scale_factor=vae_scale_factor,
+            vae_latent_channels=vae_latent_channels,
+            do_convert_rgb=do_convert_rgb,
+        )
+
+    def calculate_default_height_width(self, height: int, width: int, target_size: int):
+        crop_size_list = generate_crop_size_list(base_size=target_size, patch_size=self.config.vae_scale_factor)
+        aspect_ratios = np.array([round(float(h) / float(w), 5) for h, w in crop_size_list])
+        height, width = get_closest_ratio(height, width, aspect_ratios, crop_size_list)[0]
+
+        return height, width
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py
new file mode 100644
index 000000000000..00a703939004
--- /dev/null
+++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py
@@ -0,0 +1,837 @@
+# Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import ByT5Tokenizer, Qwen2_5_VLTextModel, Qwen2Tokenizer, T5EncoderModel
+
+from ...guiders import ClassifierFreeGuidance
+from ...models import AutoencoderKLHunyuanVideo15, HunyuanVideo15Transformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .image_processor import HunyuanVideo15ImageProcessor
+from .pipeline_output import HunyuanVideo15PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import HunyuanVideo15Pipeline
+        >>> from diffusers.utils import export_to_video
+
+        >>> model_id = "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v"
+        >>> pipe = HunyuanVideo15Pipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+        >>> pipe.vae.enable_tiling()
+        >>> pipe.to("cuda")
+
+        >>> output = pipe(
+        ...     prompt="A cat walks on the grass, realistic",
+        ...     num_inference_steps=50,
+        ... ).frames[0]
+        >>> export_to_video(output, "output.mp4", fps=15)
+        ```
+"""
+
+
+def format_text_input(prompt: List[str], system_message: str) -> List[Dict[str, Any]]:
+    """
+    Apply text to template.
+
+    Args:
+        prompt (List[str]): Input text.
+        system_message (str): System message.
+
+    Returns:
+        List[Dict[str, Any]]: List of chat conversation.
+    """
+
+    template = [
+        [{"role": "system", "content": system_message}, {"role": "user", "content": p if p else " "}] for p in prompt
+    ]
+
+    return template
+
+
+def extract_glyph_texts(prompt: str) -> List[str]:
+    """
+    Extract glyph texts from prompt using regex pattern.
+
+    Args:
+        prompt: Input prompt string
+
+    Returns:
+        List of extracted glyph texts
+    """
+    pattern = r"\"(.*?)\"|“(.*?)”"
+    matches = re.findall(pattern, prompt)
+    result = [match[0] or match[1] for match in matches]
+    result = list(dict.fromkeys(result)) if len(result) > 1 else result
+
+    if result:
+        formatted_result = ". ".join([f'Text "{text}"' for text in result]) + ". "
+    else:
+        formatted_result = None
+
+    return formatted_result
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class HunyuanVideo15Pipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-video generation using HunyuanVideo1.5.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        transformer ([`HunyuanVideo15Transformer3DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded video latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+        vae ([`AutoencoderKLHunyuanVideo15`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`Qwen2.5-VL-7B-Instruct`]):
+            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the
+            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant.
+        tokenizer (`Qwen2Tokenizer`): Tokenizer of class [Qwen2Tokenizer].
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5EncoderModel](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel)
+            variant.
+        tokenizer_2 (`ByT5Tokenizer`): Tokenizer of class [ByT5Tokenizer]
+        guider ([`ClassifierFreeGuidance`]):
+            [ClassifierFreeGuidance]for classifier free guidance.
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        text_encoder: Qwen2_5_VLTextModel,
+        tokenizer: Qwen2Tokenizer,
+        transformer: HunyuanVideo15Transformer3DModel,
+        vae: AutoencoderKLHunyuanVideo15,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: ByT5Tokenizer,
+        guider: ClassifierFreeGuidance,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            guider=guider,
+        )
+
+        self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 16
+        self.video_processor = HunyuanVideo15ImageProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        self.target_size = self.transformer.config.target_size if getattr(self, "transformer", None) else 640
+        self.vision_states_dim = (
+            self.transformer.config.image_embed_dim if getattr(self, "transformer", None) else 1152
+        )
+        self.num_channels_latents = self.vae.config.latent_channels if hasattr(self, "vae") else 32
+        # fmt: off
+        self.system_message = "You are a helpful assistant. Describe the video by detailing the following aspects: \
+        1. The main content and theme of the video. \
+        2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
+        3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
+        4. background environment, light, style and atmosphere. \
+        5. camera angles, movements, and transitions used in the video."
+        # fmt: on
+        self.prompt_template_encode_start_idx = 108
+        self.tokenizer_max_length = 1000
+        self.tokenizer_2_max_length = 256
+        self.vision_num_semantic_tokens = 729
+        self.default_aspect_ratio = (16, 9)  # (width: height)
+
+    @staticmethod
+    def _get_mllm_prompt_embeds(
+        text_encoder: Qwen2_5_VLTextModel,
+        tokenizer: Qwen2Tokenizer,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        tokenizer_max_length: int = 1000,
+        num_hidden_layers_to_skip: int = 2,
+        # fmt: off
+        system_message: str = "You are a helpful assistant. Describe the video by detailing the following aspects: \
+        1. The main content and theme of the video. \
+        2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
+        3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
+        4. background environment, light, style and atmosphere. \
+        5. camera angles, movements, and transitions used in the video.",
+        # fmt: on
+        crop_start: int = 108,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        prompt = format_text_input(prompt, system_message)
+
+        text_inputs = tokenizer.apply_chat_template(
+            prompt,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            padding="max_length",
+            max_length=tokenizer_max_length + crop_start,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids.to(device=device)
+        prompt_attention_mask = text_inputs.attention_mask.to(device=device)
+
+        prompt_embeds = text_encoder(
+            input_ids=text_input_ids,
+            attention_mask=prompt_attention_mask,
+            output_hidden_states=True,
+        ).hidden_states[-(num_hidden_layers_to_skip + 1)]
+
+        if crop_start is not None and crop_start > 0:
+            prompt_embeds = prompt_embeds[:, crop_start:]
+            prompt_attention_mask = prompt_attention_mask[:, crop_start:]
+
+        return prompt_embeds, prompt_attention_mask
+
+    @staticmethod
+    def _get_byt5_prompt_embeds(
+        tokenizer: ByT5Tokenizer,
+        text_encoder: T5EncoderModel,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        tokenizer_max_length: int = 256,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        glyph_texts = [extract_glyph_texts(p) for p in prompt]
+
+        prompt_embeds_list = []
+        prompt_embeds_mask_list = []
+
+        for glyph_text in glyph_texts:
+            if glyph_text is None:
+                glyph_text_embeds = torch.zeros(
+                    (1, tokenizer_max_length, text_encoder.config.d_model), device=device, dtype=text_encoder.dtype
+                )
+                glyph_text_embeds_mask = torch.zeros((1, tokenizer_max_length), device=device, dtype=torch.int64)
+            else:
+                txt_tokens = tokenizer(
+                    glyph_text,
+                    padding="max_length",
+                    max_length=tokenizer_max_length,
+                    truncation=True,
+                    add_special_tokens=True,
+                    return_tensors="pt",
+                ).to(device)
+
+                glyph_text_embeds = text_encoder(
+                    input_ids=txt_tokens.input_ids,
+                    attention_mask=txt_tokens.attention_mask.float(),
+                )[0]
+                glyph_text_embeds = glyph_text_embeds.to(device=device)
+                glyph_text_embeds_mask = txt_tokens.attention_mask.to(device=device)
+
+            prompt_embeds_list.append(glyph_text_embeds)
+            prompt_embeds_mask_list.append(glyph_text_embeds_mask)
+
+        prompt_embeds = torch.cat(prompt_embeds_list, dim=0)
+        prompt_embeds_mask = torch.cat(prompt_embeds_mask_list, dim=0)
+
+        return prompt_embeds, prompt_embeds_mask
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+        batch_size: int = 1,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_mask: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_embeds_mask_2: Optional[torch.Tensor] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            batch_size (`int`):
+                batch size of prompts, defaults to 1
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. If not provided, text embeddings will be generated from `prompt` input
+                argument.
+            prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated text mask. If not provided, text mask will be generated from `prompt` input argument.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated glyph text embeddings from ByT5. If not provided, will be generated from `prompt` input
+                argument using self.tokenizer_2 and self.text_encoder_2.
+            prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated glyph text mask from ByT5. If not provided, will be generated from `prompt` input
+                argument using self.tokenizer_2 and self.text_encoder_2.
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        if prompt is None:
+            prompt = [""] * batch_size
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_embeds, prompt_embeds_mask = self._get_mllm_prompt_embeds(
+                tokenizer=self.tokenizer,
+                text_encoder=self.text_encoder,
+                prompt=prompt,
+                device=device,
+                tokenizer_max_length=self.tokenizer_max_length,
+                system_message=self.system_message,
+                crop_start=self.prompt_template_encode_start_idx,
+            )
+
+        if prompt_embeds_2 is None:
+            prompt_embeds_2, prompt_embeds_mask_2 = self._get_byt5_prompt_embeds(
+                tokenizer=self.tokenizer_2,
+                text_encoder=self.text_encoder_2,
+                prompt=prompt,
+                device=device,
+                tokenizer_max_length=self.tokenizer_2_max_length,
+            )
+
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_videos_per_prompt, seq_len)
+
+        _, seq_len_2, _ = prompt_embeds_2.shape
+        prompt_embeds_2 = prompt_embeds_2.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds_2 = prompt_embeds_2.view(batch_size * num_videos_per_prompt, seq_len_2, -1)
+        prompt_embeds_mask_2 = prompt_embeds_mask_2.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds_mask_2 = prompt_embeds_mask_2.view(batch_size * num_videos_per_prompt, seq_len_2)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        prompt_embeds_mask = prompt_embeds_mask.to(dtype=dtype, device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(dtype=dtype, device=device)
+        prompt_embeds_mask_2 = prompt_embeds_mask_2.to(dtype=dtype, device=device)
+
+        return prompt_embeds, prompt_embeds_mask, prompt_embeds_2, prompt_embeds_mask_2
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_embeds_mask=None,
+        negative_prompt_embeds_mask=None,
+        prompt_embeds_2=None,
+        prompt_embeds_mask_2=None,
+        negative_prompt_embeds_2=None,
+        negative_prompt_embeds_mask_2=None,
+    ):
+        if height is None and width is not None:
+            raise ValueError("If `width` is provided, `height` also have to be provided.")
+        elif width is None and height is not None:
+            raise ValueError("If `height` is provided, `width` also have to be provided.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_embeds_mask is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if prompt is None and prompt_embeds_2 is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined."
+            )
+
+        if prompt_embeds_2 is not None and prompt_embeds_mask_2 is None:
+            raise ValueError(
+                "If `prompt_embeds_2` are provided, `prompt_embeds_mask_2` also have to be passed. Make sure to generate `prompt_embeds_mask_2` from the same text encoder that was used to generate `prompt_embeds_2`."
+            )
+        if negative_prompt_embeds_2 is not None and negative_prompt_embeds_mask_2 is None:
+            raise ValueError(
+                "If `negative_prompt_embeds_2` are provided, `negative_prompt_embeds_mask_2` also have to be passed. Make sure to generate `negative_prompt_embeds_mask_2` from the same text encoder that was used to generate `negative_prompt_embeds_2`."
+            )
+
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 32,
+        height: int = 720,
+        width: int = 1280,
+        num_frames: int = 129,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+
+    def prepare_cond_latents_and_mask(self, latents, dtype: Optional[torch.dtype], device: Optional[torch.device]):
+        """
+        Prepare conditional latents and mask for t2v generation.
+
+        Args:
+            latents: Main latents tensor (B, C, F, H, W)
+
+        Returns:
+            tuple: (cond_latents_concat, mask_concat) - both are zero tensors for t2v
+        """
+        batch, channels, frames, height, width = latents.shape
+
+        cond_latents_concat = torch.zeros(batch, channels, frames, height, width, dtype=dtype, device=device)
+
+        mask_concat = torch.zeros(batch, 1, frames, height, width, dtype=dtype, device=device)
+
+        return cond_latents_concat, mask_concat
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_frames: int = 121,
+        num_inference_steps: int = 50,
+        sigmas: List[float] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_mask: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_embeds_mask_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_mask_2: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+            height (`int`, *optional*):
+                The height in pixels of the generated video.
+            width (`int`, *optional*):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `121`):
+                The number of frames in the generated video.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated mask for prompt embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated mask for negative prompt embeddings.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings from the second text encoder. Can be used to easily tweak text inputs.
+            prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated mask for prompt embeddings from the second text encoder.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings from the second text encoder.
+            negative_prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated mask for negative prompt embeddings from the second text encoder.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between "np", "pt", or "latent".
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`HunyuanVideo15PipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~HunyuanVideo15PipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`HunyuanVideo15PipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated videos.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_embeds_mask=prompt_embeds_mask,
+            negative_prompt_embeds_mask=negative_prompt_embeds_mask,
+            prompt_embeds_2=prompt_embeds_2,
+            prompt_embeds_mask_2=prompt_embeds_mask_2,
+            negative_prompt_embeds_2=negative_prompt_embeds_2,
+            negative_prompt_embeds_mask_2=negative_prompt_embeds_mask_2,
+        )
+
+        if height is None and width is None:
+            height, width = self.video_processor.calculate_default_height_width(
+                self.default_aspect_ratio[1], self.default_aspect_ratio[0], self.target_size
+            )
+
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode input prompt
+        prompt_embeds, prompt_embeds_mask, prompt_embeds_2, prompt_embeds_mask_2 = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            batch_size=batch_size,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_mask=prompt_embeds_mask,
+            prompt_embeds_2=prompt_embeds_2,
+            prompt_embeds_mask_2=prompt_embeds_mask_2,
+        )
+
+        if self.guider._enabled and self.guider.num_conditions > 1:
+            (
+                negative_prompt_embeds,
+                negative_prompt_embeds_mask,
+                negative_prompt_embeds_2,
+                negative_prompt_embeds_mask_2,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                device=device,
+                dtype=self.transformer.dtype,
+                batch_size=batch_size,
+                num_videos_per_prompt=num_videos_per_prompt,
+                prompt_embeds=negative_prompt_embeds,
+                prompt_embeds_mask=negative_prompt_embeds_mask,
+                prompt_embeds_2=negative_prompt_embeds_2,
+                prompt_embeds_mask_2=negative_prompt_embeds_mask_2,
+            )
+
+        # 4. Prepare timesteps
+        sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
+
+        # 5. Prepare latent variables
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            self.num_channels_latents,
+            height,
+            width,
+            num_frames,
+            self.transformer.dtype,
+            device,
+            generator,
+            latents,
+        )
+        cond_latents_concat, mask_concat = self.prepare_cond_latents_and_mask(latents, self.transformer.dtype, device)
+        image_embeds = torch.zeros(
+            batch_size,
+            self.vision_num_semantic_tokens,
+            self.vision_states_dim,
+            dtype=self.transformer.dtype,
+            device=device,
+        )
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                self._current_timestep = t
+                latent_model_input = torch.cat([latents, cond_latents_concat, mask_concat], dim=1)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+
+                # Step 1: Collect model inputs needed for the guidance method
+                # conditional inputs should always be first element in the tuple
+                guider_inputs = {
+                    "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
+                    "encoder_attention_mask": (prompt_embeds_mask, negative_prompt_embeds_mask),
+                    "encoder_hidden_states_2": (prompt_embeds_2, negative_prompt_embeds_2),
+                    "encoder_attention_mask_2": (prompt_embeds_mask_2, negative_prompt_embeds_mask_2),
+                }
+
+                # Step 2: Update guider's internal state for this denoising step
+                self.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
+
+                # Step 3: Prepare batched model inputs based on the guidance method
+                # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+                # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+                # you will get a guider_state with two batches:
+                #   guider_state = [
+                #       {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},      # conditional batch
+                #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+                #   ]
+                # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+                guider_state = self.guider.prepare_inputs(guider_inputs)
+                # Step 4: Run the denoiser for each batch
+                # Each batch in guider_state represents a different conditioning (conditional, unconditional, etc.).
+                # We run the model once per batch and store the noise prediction in guider_state_batch.noise_pred.
+                for guider_state_batch in guider_state:
+                    self.guider.prepare_models(self.transformer)
+
+                    # Extract conditioning kwargs for this batch (e.g., encoder_hidden_states)
+                    cond_kwargs = {
+                        input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()
+                    }
+
+                    # e.g. "pred_cond"/"pred_uncond"
+                    context_name = getattr(guider_state_batch, self.guider._identifier_key)
+                    with self.transformer.cache_context(context_name):
+                        # Run denoiser and store noise prediction in this batch
+                        guider_state_batch.noise_pred = self.transformer(
+                            hidden_states=latent_model_input,
+                            image_embeds=image_embeds,
+                            timestep=timestep,
+                            attention_kwargs=self.attention_kwargs,
+                            return_dict=False,
+                            **cond_kwargs,
+                        )[0]
+
+                    # Cleanup model (e.g., remove hooks)
+                    self.guider.cleanup_models(self.transformer)
+
+                # Step 5: Combine predictions using the guidance method
+                # The guider takes all noise predictions from guider_state and combines them according to the guidance algorithm.
+                # Continuing the CFG example, the guider receives:
+                #   guider_state = [
+                #       {"encoder_hidden_states": prompt_embeds, "noise_pred": noise_pred_cond, "__guidance_identifier__": "pred_cond"},      # batch 0
+                #       {"encoder_hidden_states": negative_prompt_embeds, "noise_pred": noise_pred_uncond, "__guidance_identifier__": "pred_uncond"},  # batch 1
+                #   ]
+                # And extracts predictions using the __guidance_identifier__:
+                #   pred_cond = guider_state[0]["noise_pred"]      # extracts noise_pred_cond
+                #   pred_uncond = guider_state[1]["noise_pred"]    # extracts noise_pred_uncond
+                # Then applies CFG formula:
+                #   noise_pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond)
+                # Returns GuiderOutput(pred=noise_pred, pred_cond=pred_cond, pred_uncond=pred_uncond)
+                noise_pred = self.guider(guider_state)[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        # 8. decode the latents to video and postprocess
+        if not output_type == "latent":
+            latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return HunyuanVideo15PipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
new file mode 100644
index 000000000000..8c555eabba11
--- /dev/null
+++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
@@ -0,0 +1,960 @@
+# Copyright 2025 The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import (
+    ByT5Tokenizer,
+    Qwen2_5_VLTextModel,
+    Qwen2Tokenizer,
+    SiglipImageProcessor,
+    SiglipVisionModel,
+    T5EncoderModel,
+)
+
+from ...guiders import ClassifierFreeGuidance
+from ...models import AutoencoderKLHunyuanVideo15, HunyuanVideo15Transformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .image_processor import HunyuanVideo15ImageProcessor
+from .pipeline_output import HunyuanVideo15PipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import HunyuanVideo15ImageToVideoPipeline
+        >>> from diffusers.utils import export_to_video
+
+        >>> model_id = "hunyuanvideo-community/HunyuanVideo-1.5-480p_i2v"
+        >>> pipe = HunyuanVideo15ImageToVideoPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+        >>> pipe.vae.enable_tiling()
+        >>> pipe.to("cuda")
+
+        >>> image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG")
+
+        >>> output = pipe(
+        ...     prompt="Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.",
+        ...     image=image,
+        ...     num_inference_steps=50,
+        ... ).frames[0]
+        >>> export_to_video(output, "output.mp4", fps=24)
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.hunyuan_video1_5.pipeline_hunyuan_video1_5.format_text_input
+def format_text_input(prompt: List[str], system_message: str) -> List[Dict[str, Any]]:
+    """
+    Apply text to template.
+
+    Args:
+        prompt (List[str]): Input text.
+        system_message (str): System message.
+
+    Returns:
+        List[Dict[str, Any]]: List of chat conversation.
+    """
+
+    template = [
+        [{"role": "system", "content": system_message}, {"role": "user", "content": p if p else " "}] for p in prompt
+    ]
+
+    return template
+
+
+# Copied from diffusers.pipelines.hunyuan_video1_5.pipeline_hunyuan_video1_5.extract_glyph_texts
+def extract_glyph_texts(prompt: str) -> List[str]:
+    """
+    Extract glyph texts from prompt using regex pattern.
+
+    Args:
+        prompt: Input prompt string
+
+    Returns:
+        List of extracted glyph texts
+    """
+    pattern = r"\"(.*?)\"|“(.*?)”"
+    matches = re.findall(pattern, prompt)
+    result = [match[0] or match[1] for match in matches]
+    result = list(dict.fromkeys(result)) if len(result) > 1 else result
+
+    if result:
+        formatted_result = ". ".join([f'Text "{text}"' for text in result]) + ". "
+    else:
+        formatted_result = None
+
+    return formatted_result
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class HunyuanVideo15ImageToVideoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image-to-video generation using HunyuanVideo1.5.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        transformer ([`HunyuanVideo15Transformer3DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded video latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+        vae ([`AutoencoderKLHunyuanVideo15`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`Qwen2.5-VL-7B-Instruct`]):
+            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the
+            [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant.
+        tokenizer (`Qwen2Tokenizer`): Tokenizer of class [Qwen2Tokenizer].
+        text_encoder_2 ([`T5EncoderModel`]):
+            [T5EncoderModel](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel)
+            variant.
+        tokenizer_2 (`ByT5Tokenizer`): Tokenizer of class [ByT5Tokenizer]
+        guider ([`ClassifierFreeGuidance`]):
+            [ClassifierFreeGuidance]for classifier free guidance.
+        image_encoder ([`SiglipVisionModel`]):
+            [SiglipVisionModel](https://huggingface.co/docs/transformers/en/model_doc/siglip#transformers.SiglipVisionModel)
+            variant.
+        feature_extractor ([`SiglipImageProcessor`]):
+            [SiglipImageProcessor](https://huggingface.co/docs/transformers/en/model_doc/siglip#transformers.SiglipImageProcessor)
+            variant.
+    """
+
+    model_cpu_offload_seq = "image_encoder->text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        text_encoder: Qwen2_5_VLTextModel,
+        tokenizer: Qwen2Tokenizer,
+        transformer: HunyuanVideo15Transformer3DModel,
+        vae: AutoencoderKLHunyuanVideo15,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        text_encoder_2: T5EncoderModel,
+        tokenizer_2: ByT5Tokenizer,
+        guider: ClassifierFreeGuidance,
+        image_encoder: SiglipVisionModel,
+        feature_extractor: SiglipImageProcessor,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            guider=guider,
+            image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+
+        self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 16
+        self.video_processor = HunyuanVideo15ImageProcessor(
+            vae_scale_factor=self.vae_scale_factor_spatial, do_resize=False, do_convert_rgb=True
+        )
+        self.target_size = self.transformer.config.target_size if getattr(self, "transformer", None) else 640
+        self.vision_states_dim = (
+            self.transformer.config.image_embed_dim if getattr(self, "transformer", None) else 1152
+        )
+        self.num_channels_latents = self.vae.config.latent_channels if hasattr(self, "vae") else 32
+        # fmt: off
+        self.system_message = "You are a helpful assistant. Describe the video by detailing the following aspects: \
+        1. The main content and theme of the video. \
+        2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
+        3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
+        4. background environment, light, style and atmosphere. \
+        5. camera angles, movements, and transitions used in the video."
+        # fmt: on
+        self.prompt_template_encode_start_idx = 108
+        self.tokenizer_max_length = 1000
+        self.tokenizer_2_max_length = 256
+        self.vision_num_semantic_tokens = 729
+
+    @staticmethod
+    # Copied from diffusers.pipelines.hunyuan_video1_5.pipeline_hunyuan_video1_5.HunyuanVideo15Pipeline._get_mllm_prompt_embeds
+    def _get_mllm_prompt_embeds(
+        text_encoder: Qwen2_5_VLTextModel,
+        tokenizer: Qwen2Tokenizer,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        tokenizer_max_length: int = 1000,
+        num_hidden_layers_to_skip: int = 2,
+        # fmt: off
+        system_message: str = "You are a helpful assistant. Describe the video by detailing the following aspects: \
+        1. The main content and theme of the video. \
+        2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
+        3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
+        4. background environment, light, style and atmosphere. \
+        5. camera angles, movements, and transitions used in the video.",
+        # fmt: on
+        crop_start: int = 108,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        prompt = format_text_input(prompt, system_message)
+
+        text_inputs = tokenizer.apply_chat_template(
+            prompt,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            padding="max_length",
+            max_length=tokenizer_max_length + crop_start,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids.to(device=device)
+        prompt_attention_mask = text_inputs.attention_mask.to(device=device)
+
+        prompt_embeds = text_encoder(
+            input_ids=text_input_ids,
+            attention_mask=prompt_attention_mask,
+            output_hidden_states=True,
+        ).hidden_states[-(num_hidden_layers_to_skip + 1)]
+
+        if crop_start is not None and crop_start > 0:
+            prompt_embeds = prompt_embeds[:, crop_start:]
+            prompt_attention_mask = prompt_attention_mask[:, crop_start:]
+
+        return prompt_embeds, prompt_attention_mask
+
+    @staticmethod
+    # Copied from diffusers.pipelines.hunyuan_video1_5.pipeline_hunyuan_video1_5.HunyuanVideo15Pipeline._get_byt5_prompt_embeds
+    def _get_byt5_prompt_embeds(
+        tokenizer: ByT5Tokenizer,
+        text_encoder: T5EncoderModel,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        tokenizer_max_length: int = 256,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        glyph_texts = [extract_glyph_texts(p) for p in prompt]
+
+        prompt_embeds_list = []
+        prompt_embeds_mask_list = []
+
+        for glyph_text in glyph_texts:
+            if glyph_text is None:
+                glyph_text_embeds = torch.zeros(
+                    (1, tokenizer_max_length, text_encoder.config.d_model), device=device, dtype=text_encoder.dtype
+                )
+                glyph_text_embeds_mask = torch.zeros((1, tokenizer_max_length), device=device, dtype=torch.int64)
+            else:
+                txt_tokens = tokenizer(
+                    glyph_text,
+                    padding="max_length",
+                    max_length=tokenizer_max_length,
+                    truncation=True,
+                    add_special_tokens=True,
+                    return_tensors="pt",
+                ).to(device)
+
+                glyph_text_embeds = text_encoder(
+                    input_ids=txt_tokens.input_ids,
+                    attention_mask=txt_tokens.attention_mask.float(),
+                )[0]
+                glyph_text_embeds = glyph_text_embeds.to(device=device)
+                glyph_text_embeds_mask = txt_tokens.attention_mask.to(device=device)
+
+            prompt_embeds_list.append(glyph_text_embeds)
+            prompt_embeds_mask_list.append(glyph_text_embeds_mask)
+
+        prompt_embeds = torch.cat(prompt_embeds_list, dim=0)
+        prompt_embeds_mask = torch.cat(prompt_embeds_mask_list, dim=0)
+
+        return prompt_embeds, prompt_embeds_mask
+
+    @staticmethod
+    def _get_image_latents(
+        vae: AutoencoderKLHunyuanVideo15,
+        image_processor: HunyuanVideo15ImageProcessor,
+        image: PIL.Image.Image,
+        height: int,
+        width: int,
+        device: torch.device,
+    ) -> torch.Tensor:
+        vae_dtype = vae.dtype
+        image_tensor = image_processor.preprocess(image, height=height, width=width).to(device, dtype=vae_dtype)
+        image_tensor = image_tensor.unsqueeze(2)
+        image_latents = retrieve_latents(vae.encode(image_tensor), sample_mode="argmax")
+        image_latents = image_latents * vae.config.scaling_factor
+        return image_latents
+
+    @staticmethod
+    def _get_image_embeds(
+        image_encoder: SiglipVisionModel,
+        feature_extractor: SiglipImageProcessor,
+        image: PIL.Image.Image,
+        device: torch.device,
+    ) -> torch.Tensor:
+        image_encoder_dtype = next(image_encoder.parameters()).dtype
+        image = feature_extractor.preprocess(images=image, do_resize=True, return_tensors="pt", do_convert_rgb=True)
+        image = image.to(device=device, dtype=image_encoder_dtype)
+        image_enc_hidden_states = image_encoder(**image).last_hidden_state
+
+        return image_enc_hidden_states
+
+    def encode_image(
+        self,
+        image: PIL.Image.Image,
+        batch_size: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        image_embeds = self._get_image_embeds(
+            image_encoder=self.image_encoder,
+            feature_extractor=self.feature_extractor,
+            image=image,
+            device=device,
+        )
+        image_embeds = image_embeds.repeat(batch_size, 1, 1)
+        image_embeds = image_embeds.to(device=device, dtype=dtype)
+        return image_embeds
+
+    # Copied from diffusers.pipelines.hunyuan_video1_5.pipeline_hunyuan_video1_5.HunyuanVideo15Pipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+        batch_size: int = 1,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_mask: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_embeds_mask_2: Optional[torch.Tensor] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            batch_size (`int`):
+                batch size of prompts, defaults to 1
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. If not provided, text embeddings will be generated from `prompt` input
+                argument.
+            prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated text mask. If not provided, text mask will be generated from `prompt` input argument.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated glyph text embeddings from ByT5. If not provided, will be generated from `prompt` input
+                argument using self.tokenizer_2 and self.text_encoder_2.
+            prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated glyph text mask from ByT5. If not provided, will be generated from `prompt` input
+                argument using self.tokenizer_2 and self.text_encoder_2.
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        if prompt is None:
+            prompt = [""] * batch_size
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if prompt_embeds is None:
+            prompt_embeds, prompt_embeds_mask = self._get_mllm_prompt_embeds(
+                tokenizer=self.tokenizer,
+                text_encoder=self.text_encoder,
+                prompt=prompt,
+                device=device,
+                tokenizer_max_length=self.tokenizer_max_length,
+                system_message=self.system_message,
+                crop_start=self.prompt_template_encode_start_idx,
+            )
+
+        if prompt_embeds_2 is None:
+            prompt_embeds_2, prompt_embeds_mask_2 = self._get_byt5_prompt_embeds(
+                tokenizer=self.tokenizer_2,
+                text_encoder=self.text_encoder_2,
+                prompt=prompt,
+                device=device,
+                tokenizer_max_length=self.tokenizer_2_max_length,
+            )
+
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_videos_per_prompt, seq_len)
+
+        _, seq_len_2, _ = prompt_embeds_2.shape
+        prompt_embeds_2 = prompt_embeds_2.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds_2 = prompt_embeds_2.view(batch_size * num_videos_per_prompt, seq_len_2, -1)
+        prompt_embeds_mask_2 = prompt_embeds_mask_2.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds_mask_2 = prompt_embeds_mask_2.view(batch_size * num_videos_per_prompt, seq_len_2)
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        prompt_embeds_mask = prompt_embeds_mask.to(dtype=dtype, device=device)
+        prompt_embeds_2 = prompt_embeds_2.to(dtype=dtype, device=device)
+        prompt_embeds_mask_2 = prompt_embeds_mask_2.to(dtype=dtype, device=device)
+
+        return prompt_embeds, prompt_embeds_mask, prompt_embeds_2, prompt_embeds_mask_2
+
+    def check_inputs(
+        self,
+        prompt,
+        image: PIL.Image.Image,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_embeds_mask=None,
+        negative_prompt_embeds_mask=None,
+        prompt_embeds_2=None,
+        prompt_embeds_mask_2=None,
+        negative_prompt_embeds_2=None,
+        negative_prompt_embeds_mask_2=None,
+    ):
+        if not isinstance(image, PIL.Image.Image):
+            raise ValueError(f"`image` has to be of type `PIL.Image.Image` but is {type(image)}")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_embeds_mask is None:
+            raise ValueError(
+                "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`."
+            )
+        if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        if prompt is None and prompt_embeds_2 is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined."
+            )
+
+        if prompt_embeds_2 is not None and prompt_embeds_mask_2 is None:
+            raise ValueError(
+                "If `prompt_embeds_2` are provided, `prompt_embeds_mask_2` also have to be passed. Make sure to generate `prompt_embeds_mask_2` from the same text encoder that was used to generate `prompt_embeds_2`."
+            )
+        if negative_prompt_embeds_2 is not None and negative_prompt_embeds_mask_2 is None:
+            raise ValueError(
+                "If `negative_prompt_embeds_2` are provided, `negative_prompt_embeds_mask_2` also have to be passed. Make sure to generate `negative_prompt_embeds_mask_2` from the same text encoder that was used to generate `negative_prompt_embeds_2`."
+            )
+
+    # Copied from diffusers.pipelines.hunyuan_video1_5.pipeline_hunyuan_video1_5.HunyuanVideo15Pipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 32,
+        height: int = 720,
+        width: int = 1280,
+        num_frames: int = 129,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            num_channels_latents,
+            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+
+    def prepare_cond_latents_and_mask(
+        self,
+        latents: torch.Tensor,
+        image: PIL.Image.Image,
+        batch_size: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ):
+        """
+        Prepare conditional latents and mask for t2v generation.
+
+        Args:
+            latents: Main latents tensor (B, C, F, H, W)
+
+        Returns:
+            tuple: (cond_latents_concat, mask_concat) - both are zero tensors for t2v
+        """
+
+        batch, channels, frames, height, width = latents.shape
+
+        image_latents = self._get_image_latents(
+            vae=self.vae,
+            image_processor=self.video_processor,
+            image=image,
+            height=height,
+            width=width,
+            device=device,
+        )
+
+        latent_condition = image_latents.repeat(batch_size, 1, frames, 1, 1)
+        latent_condition[:, :, 1:, :, :] = 0
+        latent_condition = latent_condition.to(device=device, dtype=dtype)
+
+        latent_mask = torch.zeros(batch, 1, frames, height, width, dtype=dtype, device=device)
+        latent_mask[:, :, 0, :, :] = 1.0
+
+        return latent_condition, latent_mask
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PIL.Image.Image,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        num_frames: int = 121,
+        num_inference_steps: int = 50,
+        sigmas: List[float] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_embeds_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_mask: Optional[torch.Tensor] = None,
+        prompt_embeds_2: Optional[torch.Tensor] = None,
+        prompt_embeds_mask_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_2: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_mask_2: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The input image to condition video generation on.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+            num_frames (`int`, defaults to `121`):
+                The number of frames in the generated video.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated mask for prompt embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_prompt_embeds_mask (`torch.Tensor`, *optional*):
+                Pre-generated mask for negative prompt embeddings.
+            prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings from the second text encoder. Can be used to easily tweak text inputs.
+            prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated mask for prompt embeddings from the second text encoder.
+            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings from the second text encoder.
+            negative_prompt_embeds_mask_2 (`torch.Tensor`, *optional*):
+                Pre-generated mask for negative prompt embeddings from the second text encoder.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between "np", "pt", or "latent".
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`HunyuanVideo15PipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+
+        Examples:
+
+        Returns:
+            [`~HunyuanVideo15PipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`HunyuanVideo15PipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated videos.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            image=image,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_embeds_mask=prompt_embeds_mask,
+            negative_prompt_embeds_mask=negative_prompt_embeds_mask,
+            prompt_embeds_2=prompt_embeds_2,
+            prompt_embeds_mask_2=prompt_embeds_mask_2,
+            negative_prompt_embeds_2=negative_prompt_embeds_2,
+            negative_prompt_embeds_mask_2=negative_prompt_embeds_mask_2,
+        )
+
+        height, width = self.video_processor.calculate_default_height_width(
+            height=image.size[1], width=image.size[0], target_size=self.target_size
+        )
+        image = self.video_processor.resize(image, height=height, width=width, resize_mode="crop")
+
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode image
+        image_embeds = self.encode_image(
+            image=image,
+            batch_size=batch_size * num_videos_per_prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+        )
+
+        # 4. Encode input prompt
+        prompt_embeds, prompt_embeds_mask, prompt_embeds_2, prompt_embeds_mask_2 = self.encode_prompt(
+            prompt=prompt,
+            device=device,
+            dtype=self.transformer.dtype,
+            batch_size=batch_size,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            prompt_embeds_mask=prompt_embeds_mask,
+            prompt_embeds_2=prompt_embeds_2,
+            prompt_embeds_mask_2=prompt_embeds_mask_2,
+        )
+
+        if self.guider._enabled and self.guider.num_conditions > 1:
+            (
+                negative_prompt_embeds,
+                negative_prompt_embeds_mask,
+                negative_prompt_embeds_2,
+                negative_prompt_embeds_mask_2,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                device=device,
+                dtype=self.transformer.dtype,
+                batch_size=batch_size,
+                num_videos_per_prompt=num_videos_per_prompt,
+                prompt_embeds=negative_prompt_embeds,
+                prompt_embeds_mask=negative_prompt_embeds_mask,
+                prompt_embeds_2=negative_prompt_embeds_2,
+                prompt_embeds_mask_2=negative_prompt_embeds_mask_2,
+            )
+
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            batch_size=batch_size * num_videos_per_prompt,
+            num_channels_latents=self.num_channels_latents,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            dtype=self.transformer.dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        cond_latents_concat, mask_concat = self.prepare_cond_latents_and_mask(
+            latents=latents,
+            image=image,
+            batch_size=batch_size * num_videos_per_prompt,
+            height=height,
+            width=width,
+            dtype=self.transformer.dtype,
+            device=device,
+        )
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                self._current_timestep = t
+                latent_model_input = torch.cat([latents, cond_latents_concat, mask_concat], dim=1)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)
+
+                if self.transformer.config.use_meanflow:
+                    if i == len(timesteps) - 1:
+                        timestep_r = torch.tensor([0.0], device=device)
+                    else:
+                        timestep_r = timesteps[i + 1]
+                    timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype)
+                else:
+                    timestep_r = None
+
+                # Step 1: Collect model inputs needed for the guidance method
+                # conditional inputs should always be first element in the tuple
+                guider_inputs = {
+                    "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
+                    "encoder_attention_mask": (prompt_embeds_mask, negative_prompt_embeds_mask),
+                    "encoder_hidden_states_2": (prompt_embeds_2, negative_prompt_embeds_2),
+                    "encoder_attention_mask_2": (prompt_embeds_mask_2, negative_prompt_embeds_mask_2),
+                }
+
+                # Step 2: Update guider's internal state for this denoising step
+                self.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
+
+                # Step 3: Prepare batched model inputs based on the guidance method
+                # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+                # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+                # you will get a guider_state with two batches:
+                #   guider_state = [
+                #       {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},      # conditional batch
+                #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+                #   ]
+                # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+                guider_state = self.guider.prepare_inputs(guider_inputs)
+                # Step 4: Run the denoiser for each batch
+                # Each batch in guider_state represents a different conditioning (conditional, unconditional, etc.).
+                # We run the model once per batch and store the noise prediction in guider_state_batch.noise_pred.
+                for guider_state_batch in guider_state:
+                    self.guider.prepare_models(self.transformer)
+
+                    # Extract conditioning kwargs for this batch (e.g., encoder_hidden_states)
+                    cond_kwargs = {
+                        input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()
+                    }
+
+                    # e.g. "pred_cond"/"pred_uncond"
+                    context_name = getattr(guider_state_batch, self.guider._identifier_key)
+                    with self.transformer.cache_context(context_name):
+                        # Run denoiser and store noise prediction in this batch
+                        guider_state_batch.noise_pred = self.transformer(
+                            hidden_states=latent_model_input,
+                            image_embeds=image_embeds,
+                            timestep=timestep,
+                            timestep_r=timestep_r,
+                            attention_kwargs=self.attention_kwargs,
+                            return_dict=False,
+                            **cond_kwargs,
+                        )[0]
+
+                    # Cleanup model (e.g., remove hooks)
+                    self.guider.cleanup_models(self.transformer)
+
+                # Step 5: Combine predictions using the guidance method
+                # The guider takes all noise predictions from guider_state and combines them according to the guidance algorithm.
+                # Continuing the CFG example, the guider receives:
+                #   guider_state = [
+                #       {"encoder_hidden_states": prompt_embeds, "noise_pred": noise_pred_cond, "__guidance_identifier__": "pred_cond"},      # batch 0
+                #       {"encoder_hidden_states": negative_prompt_embeds, "noise_pred": noise_pred_uncond, "__guidance_identifier__": "pred_uncond"},  # batch 1
+                #   ]
+                # And extracts predictions using the __guidance_identifier__:
+                #   pred_cond = guider_state[0]["noise_pred"]      # extracts noise_pred_cond
+                #   pred_uncond = guider_state[1]["noise_pred"]    # extracts noise_pred_uncond
+                # Then applies CFG formula:
+                #   noise_pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond)
+                # Returns GuiderOutput(pred=noise_pred, pred_cond=pred_cond, pred_uncond=pred_uncond)
+                noise_pred = self.guider(guider_state)[0]
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        if not output_type == "latent":
+            latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return HunyuanVideo15PipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_output.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_output.py
new file mode 100644
index 000000000000..441164db5a09
--- /dev/null
+++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_output.py
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+
+import torch
+
+from diffusers.utils import BaseOutput
+
+
+@dataclass
+class HunyuanVideo15PipelineOutput(BaseOutput):
+    r"""
+    Output class for HunyuanVideo1.5 pipelines.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+
+    frames: torch.Tensor
diff --git a/src/diffusers/pipelines/kandinsky5/__init__.py b/src/diffusers/pipelines/kandinsky5/__init__.py
index a7975bdce926..d417ed932b92 100644
--- a/src/diffusers/pipelines/kandinsky5/__init__.py
+++ b/src/diffusers/pipelines/kandinsky5/__init__.py
@@ -23,6 +23,9 @@
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_kandinsky"] = ["Kandinsky5T2VPipeline"]
+    _import_structure["pipeline_kandinsky_i2i"] = ["Kandinsky5I2IPipeline"]
+    _import_structure["pipeline_kandinsky_i2v"] = ["Kandinsky5I2VPipeline"]
+    _import_structure["pipeline_kandinsky_t2i"] = ["Kandinsky5T2IPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:
@@ -33,6 +36,9 @@
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_kandinsky import Kandinsky5T2VPipeline
+        from .pipeline_kandinsky_i2i import Kandinsky5I2IPipeline
+        from .pipeline_kandinsky_i2v import Kandinsky5I2VPipeline
+        from .pipeline_kandinsky_t2i import Kandinsky5T2IPipeline
 
 else:
     import sys
diff --git a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py
index 3f93aa1889d0..2b666f0ec697 100644
--- a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py
+++ b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky.py
@@ -25,7 +25,14 @@
 from ...models import AutoencoderKLHunyuanVideo
 from ...models.transformers import Kandinsky5Transformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+
+# Add imports for offloading and tiling
+from ...utils import (
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -56,12 +63,17 @@
         >>> from diffusers.utils import export_to_video
 
         >>> # Available models:
-        >>> # ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers
-        >>> # ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers
-        >>> # ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers
-        >>> # ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers
-
-        >>> model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-10s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-10s-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-10s-Diffusers
+
+        >>> model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
         >>> pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
         >>> pipe = pipe.to("cuda")
 
@@ -84,7 +96,11 @@
 
 
 def basic_clean(text):
-    """Clean text using ftfy if available and unescape HTML entities."""
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Clean text using ftfy if available and unescape HTML entities.
+    """
     if is_ftfy_available():
         text = ftfy.fix_text(text)
     text = html.unescape(html.unescape(text))
@@ -92,14 +108,22 @@ def basic_clean(text):
 
 
 def whitespace_clean(text):
-    """Normalize whitespace in text by replacing multiple spaces with single space."""
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Normalize whitespace in text by replacing multiple spaces with single space.
+    """
     text = re.sub(r"\s+", " ", text)
     text = text.strip()
     return text
 
 
 def prompt_clean(text):
-    """Apply both basic cleaning and whitespace normalization to prompts."""
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Apply both basic cleaning and whitespace normalization to prompts.
+    """
     text = whitespace_clean(basic_clean(text))
     return text
 
@@ -115,13 +139,16 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
         transformer ([`Kandinsky5Transformer3DModel`]):
             Conditional Transformer to denoise the encoded video latents.
         vae ([`AutoencoderKLHunyuanVideo`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+            Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo
+            (vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from
+            latent representations.
         text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
-            Frozen text-encoder (Qwen2.5-VL).
+            Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
         tokenizer ([`AutoProcessor`]):
             Tokenizer for Qwen2.5-VL.
         text_encoder_2 ([`CLIPTextModel`]):
-            Frozen CLIP text encoder.
+            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
         tokenizer_2 ([`CLIPTokenizer`]):
             Tokenizer for CLIP.
         scheduler ([`FlowMatchEulerDiscreteScheduler`]):
@@ -179,6 +206,26 @@ def __init__(
         self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
 
+    def _get_scale_factor(self, height: int, width: int) -> tuple:
+        """
+        Calculate the scale factor based on resolution.
+
+        Args:
+            height (int): Video height
+            width (int): Video width
+
+        Returns:
+            tuple: Scale factor as (temporal_scale, height_scale, width_scale)
+        """
+
+        def between_480p(x):
+            return 480 <= x <= 854
+
+        if between_480p(height) and between_480p(width):
+            return (1, 2, 2)
+        else:
+            return (1, 3.16, 3.16)
+
     @staticmethod
     def fast_sta_nabla(T: int, H: int, W: int, wT: int = 3, wH: int = 3, wW: int = 3, device="cuda") -> torch.Tensor:
         """
@@ -290,12 +337,32 @@ def _encode_prompt_qwen(
         dtype = dtype or self.text_encoder.dtype
 
         full_texts = [self.prompt_template.format(p) for p in prompt]
+        max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
+
+        untruncated_ids = self.tokenizer(
+            text=full_texts,
+            images=None,
+            videos=None,
+            return_tensors="pt",
+            padding="longest",
+        )["input_ids"]
+
+        if untruncated_ids.shape[-1] > max_allowed_len:
+            for i, text in enumerate(full_texts):
+                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
+                removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
+                if len(removed_text) > 0:
+                    full_texts[i] = text[: -len(removed_text)]
+                    logger.warning(
+                        "The following part of your input was truncated because `max_sequence_length` is set to "
+                        f" {max_sequence_length} tokens: {removed_text}"
+                    )
 
         inputs = self.tokenizer(
             text=full_texts,
             images=None,
             videos=None,
-            max_length=max_sequence_length + self.prompt_template_encode_start_idx,
+            max_length=max_allowed_len,
             truncation=True,
             return_tensors="pt",
             padding=True,
@@ -456,6 +523,7 @@ def check_inputs(
         prompt_cu_seqlens=None,
         negative_prompt_cu_seqlens=None,
         callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
     ):
         """
         Validate input parameters for the pipeline.
@@ -476,6 +544,10 @@ def check_inputs(
         Raises:
             ValueError: If inputs are invalid
         """
+
+        if max_sequence_length is not None and max_sequence_length > 1024:
+            raise ValueError("max_sequence_length must be less than 1024")
+
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
 
@@ -597,11 +669,6 @@ def guidance_scale(self):
         """Get the current guidance scale value."""
         return self._guidance_scale
 
-    @property
-    def do_classifier_free_guidance(self):
-        """Check if classifier-free guidance is enabled."""
-        return self._guidance_scale > 1.0
-
     @property
     def num_timesteps(self):
         """Get the number of denoising timesteps."""
@@ -639,7 +706,6 @@ def __call__(
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
-        **kwargs,
     ):
         r"""
         The call function to the pipeline for generation.
@@ -704,6 +770,7 @@ def __call__(
             prompt_cu_seqlens=prompt_cu_seqlens,
             negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
             callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
         )
 
         if num_frames % self.vae_scale_factor_temporal != 1:
@@ -737,7 +804,7 @@ def __call__(
                 dtype=dtype,
             )
 
-        if self.do_classifier_free_guidance:
+        if self.guidance_scale > 1.0:
             if negative_prompt is None:
                 negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
 
@@ -792,10 +859,13 @@ def __call__(
             else None
         )
 
-        # 7. Sparse Params for efficient attention
+        # 7. Calculate dynamic scale factor based on resolution
+        scale_factor = self._get_scale_factor(height, width)
+
+        # 8. Sparse Params for efficient attention
         sparse_params = self.get_sparse_params(latents, device)
 
-        # 8. Denoising loop
+        # 9. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         self._num_timesteps = len(timesteps)
 
@@ -814,12 +884,12 @@ def __call__(
                     timestep=timestep.to(dtype),
                     visual_rope_pos=visual_rope_pos,
                     text_rope_pos=text_rope_pos,
-                    scale_factor=(1, 2, 2),
+                    scale_factor=scale_factor,
                     sparse_params=sparse_params,
                     return_dict=True,
                 ).sample
 
-                if self.do_classifier_free_guidance and negative_prompt_embeds_qwen is not None:
+                if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
                     uncond_pred_velocity = self.transformer(
                         hidden_states=latents.to(dtype),
                         encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
@@ -827,7 +897,7 @@ def __call__(
                         timestep=timestep.to(dtype),
                         visual_rope_pos=visual_rope_pos,
                         text_rope_pos=negative_text_rope_pos,
-                        scale_factor=(1, 2, 2),
+                        scale_factor=scale_factor,
                         sparse_params=sparse_params,
                         return_dict=True,
                     ).sample
@@ -860,10 +930,10 @@ def __call__(
                 if XLA_AVAILABLE:
                     xm.mark_step()
 
-        # 8. Post-processing - extract main latents
+        # 10. Post-processing - extract main latents
         latents = latents[:, :, :, :, :num_channels_latents]
 
-        # 9. Decode latents to video
+        # 11. Decode latents to video
         if output_type != "latent":
             latents = latents.to(self.vae.dtype)
             # Reshape and normalize latents
diff --git a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2i.py b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2i.py
new file mode 100644
index 000000000000..f965cdad8f3e
--- /dev/null
+++ b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2i.py
@@ -0,0 +1,863 @@
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import regex as re
+import torch
+from torch.nn import functional as F
+from transformers import CLIPTextModel, CLIPTokenizer, Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import KandinskyLoraLoaderMixin
+from ...models import AutoencoderKL
+from ...models.transformers import Kandinsky5Transformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+
+# Add imports for offloading and tiling
+from ...utils import (
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import KandinskyImagePipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+
+logger = logging.get_logger(__name__)
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+
+        ```python
+        >>> import torch
+        >>> from diffusers import Kandinsky5I2IPipeline
+
+        >>> # Available models:
+        >>> # kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-I2I-Lite-pretrain-Diffusers
+
+        >>> model_id = "kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers"
+        >>> pipe = Kandinsky5I2IPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "A cat and a dog baking a cake together in a kitchen."
+
+        >>> output = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt="",
+        ...     height=1024,
+        ...     width=1024,
+        ...     num_inference_steps=50,
+        ...     guidance_scale=3.5,
+        ... ).frames[0]
+        ```
+"""
+
+
+def basic_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Clean text using ftfy if available and unescape HTML entities.
+    """
+    if is_ftfy_available():
+        text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Normalize whitespace in text by replacing multiple spaces with single space.
+    """
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+def prompt_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Apply both basic cleaning and whitespace normalization to prompts.
+    """
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+class Kandinsky5I2IPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
+    r"""
+    Pipeline for image-to-image generation using Kandinsky 5.0.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        transformer ([`Kandinsky5Transformer3DModel`]):
+            Conditional Transformer to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder Model [black-forest-labs/FLUX.1-dev
+            (vae)](https://huggingface.co/black-forest-labs/FLUX.1-dev) to encode and decode videos to and from latent
+            representations.
+        text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
+            Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
+        tokenizer ([`AutoProcessor`]):
+            Tokenizer for Qwen2.5-VL.
+        text_encoder_2 ([`CLIPTextModel`]):
+            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer_2 ([`CLIPTokenizer`]):
+            Tokenizer for CLIP.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds_qwen",
+        "prompt_embeds_clip",
+        "negative_prompt_embeds_qwen",
+        "negative_prompt_embeds_clip",
+    ]
+
+    def __init__(
+        self,
+        transformer: Kandinsky5Transformer3DModel,
+        vae: AutoencoderKL,
+        text_encoder: Qwen2_5_VLForConditionalGeneration,
+        tokenizer: Qwen2VLProcessor,
+        text_encoder_2: CLIPTextModel,
+        tokenizer_2: CLIPTokenizer,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            transformer=transformer,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            scheduler=scheduler,
+        )
+        self.prompt_template = "<|im_start|>system\nYou are a promt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|><|im_start|>user{}<|vision_start|><|image_pad|><|vision_end|><|im_end|>"
+        self.prompt_template_encode_start_idx = 55
+
+        self.vae_scale_factor_spatial = 8
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        self.resolutions = [(1024, 1024), (640, 1408), (1408, 640), (768, 1280), (1280, 768), (896, 1152), (1152, 896)]
+
+    def _encode_prompt_qwen(
+        self,
+        prompt: List[str],
+        image: Optional[PipelineImageInput] = None,
+        device: Optional[torch.device] = None,
+        max_sequence_length: int = 1024,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        Encode prompt using Qwen2.5-VL text encoder.
+
+        This method processes the input prompt through the Qwen2.5-VL model to generate text embeddings suitable for
+        image generation.
+
+        Args:
+            prompt List[str]: Input list of prompts
+            image (PipelineImageInput): Input list of images to condition the generation on
+            device (torch.device): Device to run encoding on
+            max_sequence_length (int): Maximum sequence length for tokenization
+            dtype (torch.dtype): Data type for embeddings
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Text embeddings and cumulative sequence lengths
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+        if not isinstance(image, list):
+            image = [image]
+        image = [i.resize((i.size[0] // 2, i.size[1] // 2)) for i in image]
+        full_texts = [self.prompt_template.format(p) for p in prompt]
+        max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
+
+        untruncated_ids = self.tokenizer(
+            text=full_texts,
+            images=image,
+            videos=None,
+            return_tensors="pt",
+            padding="longest",
+        )["input_ids"]
+
+        if untruncated_ids.shape[-1] > max_allowed_len:
+            for i, text in enumerate(full_texts):
+                tokens = untruncated_ids[i]
+                num_image_tokens = (tokens == self.tokenizer.image_token_id).sum()
+                tokens = tokens[tokens != self.tokenizer.image_token_id][self.prompt_template_encode_start_idx : -3]
+                removed_text = self.tokenizer.decode(tokens[max_sequence_length - num_image_tokens - 3 :])
+                if len(removed_text) > 0:
+                    full_texts[i] = text[: -len(removed_text)]
+                    logger.warning(
+                        "The following part of your input was truncated because `max_sequence_length` is set to "
+                        f" {max_sequence_length} tokens: {removed_text}"
+                    )
+
+        inputs = self.tokenizer(
+            text=full_texts,
+            images=image,
+            videos=None,
+            max_length=max_allowed_len,
+            truncation=True,
+            return_tensors="pt",
+            padding=True,
+        ).to(device)
+
+        embeds = self.text_encoder(
+            **inputs,
+            return_dict=True,
+            output_hidden_states=True,
+        )["hidden_states"][-1][:, self.prompt_template_encode_start_idx :]
+
+        attention_mask = inputs["attention_mask"][:, self.prompt_template_encode_start_idx :]
+        cu_seqlens = torch.cumsum(attention_mask.sum(1), dim=0)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0).to(dtype=torch.int32)
+
+        return embeds.to(dtype), cu_seqlens
+
+    def _encode_prompt_clip(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        Encode prompt using CLIP text encoder.
+
+        This method processes the input prompt through the CLIP model to generate pooled embeddings that capture
+        semantic information.
+
+        Args:
+            prompt (Union[str, List[str]]): Input prompt or list of prompts
+            device (torch.device): Device to run encoding on
+            dtype (torch.dtype): Data type for embeddings
+
+        Returns:
+            torch.Tensor: Pooled text embeddings from CLIP
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder_2.dtype
+
+        inputs = self.tokenizer_2(
+            prompt,
+            max_length=77,
+            truncation=True,
+            add_special_tokens=True,
+            padding="max_length",
+            return_tensors="pt",
+        ).to(device)
+
+        pooled_embed = self.text_encoder_2(**inputs)["pooler_output"]
+
+        return pooled_embed.to(dtype)
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        image: torch.Tensor,
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 1024,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes a single prompt (positive or negative) into text encoder hidden states.
+
+        This method combines embeddings from both Qwen2.5-VL and CLIP text encoders to create comprehensive text
+        representations for image generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                Prompt to be encoded.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                Number of images to generate per prompt.
+            max_sequence_length (`int`, *optional*, defaults to 1024):
+                Maximum sequence length for text encoding. Must be less than 1024
+            device (`torch.device`, *optional*):
+                Torch device.
+            dtype (`torch.dtype`, *optional*):
+                Torch dtype.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                - Qwen text embeddings of shape (batch_size * num_images_per_prompt, sequence_length, embedding_dim)
+                - CLIP pooled embeddings of shape (batch_size * num_images_per_prompt, clip_embedding_dim)
+                - Cumulative sequence lengths (`cu_seqlens`) for Qwen embeddings of shape (batch_size *
+                  num_images_per_prompt + 1,)
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+
+        batch_size = len(prompt)
+
+        prompt = [prompt_clean(p) for p in prompt]
+
+        # Encode with Qwen2.5-VL
+        prompt_embeds_qwen, prompt_cu_seqlens = self._encode_prompt_qwen(
+            prompt=prompt,
+            image=image,
+            device=device,
+            max_sequence_length=max_sequence_length,
+            dtype=dtype,
+        )
+        # prompt_embeds_qwen shape: [batch_size, seq_len, embed_dim]
+
+        # Encode with CLIP
+        prompt_embeds_clip = self._encode_prompt_clip(
+            prompt=prompt,
+            device=device,
+            dtype=dtype,
+        )
+        # prompt_embeds_clip shape: [batch_size, clip_embed_dim]
+
+        # Repeat embeddings for num_images_per_prompt
+        # Qwen embeddings: repeat sequence for each image, then reshape
+        prompt_embeds_qwen = prompt_embeds_qwen.repeat(
+            1, num_images_per_prompt, 1
+        )  # [batch_size, seq_len * num_images_per_prompt, embed_dim]
+        # Reshape to [batch_size * num_images_per_prompt, seq_len, embed_dim]
+        prompt_embeds_qwen = prompt_embeds_qwen.view(
+            batch_size * num_images_per_prompt, -1, prompt_embeds_qwen.shape[-1]
+        )
+
+        # CLIP embeddings: repeat for each image
+        prompt_embeds_clip = prompt_embeds_clip.repeat(
+            1, num_images_per_prompt, 1
+        )  # [batch_size, num_images_per_prompt, clip_embed_dim]
+        # Reshape to [batch_size * num_images_per_prompt, clip_embed_dim]
+        prompt_embeds_clip = prompt_embeds_clip.view(batch_size * num_images_per_prompt, -1)
+
+        # Repeat cumulative sequence lengths for num_images_per_prompt
+        # Original differences (lengths) for each prompt in the batch
+        original_lengths = prompt_cu_seqlens.diff()  # [len1, len2, ...]
+        # Repeat the lengths for num_images_per_prompt
+        repeated_lengths = original_lengths.repeat_interleave(
+            num_images_per_prompt
+        )  # [len1, len1, ..., len2, len2, ...]
+        # Reconstruct the cumulative lengths
+        repeated_cu_seqlens = torch.cat(
+            [torch.tensor([0], device=device, dtype=torch.int32), repeated_lengths.cumsum(0)]
+        )
+
+        return prompt_embeds_qwen, prompt_embeds_clip, repeated_cu_seqlens
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        image,
+        height,
+        width,
+        prompt_embeds_qwen=None,
+        prompt_embeds_clip=None,
+        negative_prompt_embeds_qwen=None,
+        negative_prompt_embeds_clip=None,
+        prompt_cu_seqlens=None,
+        negative_prompt_cu_seqlens=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        """
+        Validate input parameters for the pipeline.
+
+        Args:
+            prompt: Input prompt
+            negative_prompt: Negative prompt for guidance
+            image: Input image for conditioning
+            height: Image height
+            width: Image width
+            prompt_embeds_qwen: Pre-computed Qwen prompt embeddings
+            prompt_embeds_clip: Pre-computed CLIP prompt embeddings
+            negative_prompt_embeds_qwen: Pre-computed Qwen negative prompt embeddings
+            negative_prompt_embeds_clip: Pre-computed CLIP negative prompt embeddings
+            prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen positive prompt
+            negative_prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen negative prompt
+            callback_on_step_end_tensor_inputs: Callback tensor inputs
+
+        Raises:
+            ValueError: If inputs are invalid
+        """
+
+        if max_sequence_length is not None and max_sequence_length > 1024:
+            raise ValueError("max_sequence_length must be less than 1024")
+
+        if image is None:
+            raise ValueError("`image` must be provided for image-to-image generation")
+
+        if (width, height) not in self.resolutions:
+            resolutions_str = ",".join([f"({w},{h})" for w, h in self.resolutions])
+            logger.warning(
+                f"`height` and `width` have to be one of {resolutions_str}, but are {height} and {width}. Dimensions will be resized accordingly"
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        # Check for consistency within positive prompt embeddings and sequence lengths
+        if prompt_embeds_qwen is not None or prompt_embeds_clip is not None or prompt_cu_seqlens is not None:
+            if prompt_embeds_qwen is None or prompt_embeds_clip is None or prompt_cu_seqlens is None:
+                raise ValueError(
+                    "If any of `prompt_embeds_qwen`, `prompt_embeds_clip`, or `prompt_cu_seqlens` is provided, "
+                    "all three must be provided."
+                )
+
+        # Check for consistency within negative prompt embeddings and sequence lengths
+        if (
+            negative_prompt_embeds_qwen is not None
+            or negative_prompt_embeds_clip is not None
+            or negative_prompt_cu_seqlens is not None
+        ):
+            if (
+                negative_prompt_embeds_qwen is None
+                or negative_prompt_embeds_clip is None
+                or negative_prompt_cu_seqlens is None
+            ):
+                raise ValueError(
+                    "If any of `negative_prompt_embeds_qwen`, `negative_prompt_embeds_clip`, or `negative_prompt_cu_seqlens` is provided, "
+                    "all three must be provided."
+                )
+
+        # Check if prompt or embeddings are provided (either prompt or all required embedding components for positive)
+        if prompt is None and prompt_embeds_qwen is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_qwen` (and corresponding `prompt_embeds_clip` and `prompt_cu_seqlens`). Cannot leave all undefined."
+            )
+
+        # Validate types for prompt and negative_prompt if provided
+        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+    def prepare_latents(
+        self,
+        image: PipelineImageInput,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 1024,
+        width: int = 1024,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Prepare initial latent variables for image-to-image generation.
+
+        This method creates random noise latents with encoded image,
+
+        Args:
+            image (PipelineImageInput): Input image to condition the generation on
+            batch_size (int): Number of images to generate
+            num_channels_latents (int): Number of channels in latent space
+            height (int): Height of generated image
+            width (int): Width of generated image
+            dtype (torch.dtype): Data type for latents
+            device (torch.device): Device to create latents on
+            generator (torch.Generator): Random number generator
+            latents (torch.Tensor): Pre-existing latents to use
+
+        Returns:
+            torch.Tensor: Prepared latent tensor with encoded image
+        """
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            1,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+            num_channels_latents,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        # Generate random noise for all frames
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # Encode the input image to use as first frame
+        # Preprocess image
+        image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device, dtype=dtype)
+        # Encode image to latents using VAE
+        with torch.no_grad():
+            image_latents = self.vae.encode(image_tensor).latent_dist.sample(generator=generator)
+            image_latents = image_latents.unsqueeze(2)  # Add temporal dimension
+
+            # Normalize latents if needed
+            if hasattr(self.vae.config, "scaling_factor"):
+                image_latents = image_latents * self.vae.config.scaling_factor
+
+            # Reshape to match latent dimensions [batch, 1, height, width, channels]
+            image_latents = image_latents.permute(0, 2, 3, 4, 1)  # [batch, 1, H, W, C]
+            latents = torch.cat([latents, image_latents, torch.ones_like(latents[..., :1])], -1)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        """Get the current guidance scale value."""
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        """Get the number of denoising timesteps."""
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        """Check if generation has been interrupted."""
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 3.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds_qwen: Optional[torch.Tensor] = None,
+        prompt_embeds_clip: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_qwen: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_clip: Optional[torch.Tensor] = None,
+        prompt_cu_seqlens: Optional[torch.Tensor] = None,
+        negative_prompt_cu_seqlens: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 1024,
+    ):
+        r"""
+        The call function to the pipeline for image-to-image generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
+            height (`int`):
+                The height in pixels of the generated image.
+            width (`int`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps.
+            guidance_scale (`float`, defaults to `5.0`):
+                Guidance scale as defined in classifier-free guidance.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A torch generator to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents.
+            prompt_embeds_qwen (`torch.Tensor`, *optional*):
+                Pre-generated Qwen text embeddings.
+            prompt_embeds_clip (`torch.Tensor`, *optional*):
+                Pre-generated CLIP text embeddings.
+            negative_prompt_embeds_qwen (`torch.Tensor`, *optional*):
+                Pre-generated Qwen negative text embeddings.
+            negative_prompt_embeds_clip (`torch.Tensor`, *optional*):
+                Pre-generated CLIP negative text embeddings.
+            prompt_cu_seqlens (`torch.Tensor`, *optional*):
+                Pre-generated cumulative sequence lengths for Qwen positive prompt.
+            negative_prompt_cu_seqlens (`torch.Tensor`, *optional*):
+                Pre-generated cumulative sequence lengths for Qwen negative prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`KandinskyImagePipelineOutput`].
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function that is called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function.
+            max_sequence_length (`int`, defaults to `1024`):
+                The maximum sequence length for text and image qwen encoding. Must be less than 1024
+
+        Examples:
+
+        Returns:
+            [`~KandinskyImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`KandinskyImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+        # 1. Check inputs. Raise error if not correct
+        if height is None and width is None:
+            width, height = image[0].size if isinstance(image, list) else image.size
+        self.check_inputs(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=image,
+            height=height,
+            width=width,
+            prompt_embeds_qwen=prompt_embeds_qwen,
+            prompt_embeds_clip=prompt_embeds_clip,
+            negative_prompt_embeds_qwen=negative_prompt_embeds_qwen,
+            negative_prompt_embeds_clip=negative_prompt_embeds_clip,
+            prompt_cu_seqlens=prompt_cu_seqlens,
+            negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+        if (width, height) not in self.resolutions:
+            width, height = self.resolutions[
+                np.argmin([abs((i[0] / i[1]) - (width / height)) for i in self.resolutions])
+            ]
+
+        self._guidance_scale = guidance_scale
+        self._interrupt = False
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds_qwen.shape[0]
+
+        # 3. Encode input prompt
+        if prompt_embeds_qwen is None:
+            prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = self.encode_prompt(
+                prompt=prompt,
+                image=image,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if self.guidance_scale > 1.0:
+            if negative_prompt is None:
+                negative_prompt = ""
+
+            if isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt] * len(prompt) if prompt is not None else [negative_prompt]
+            elif len(negative_prompt) != len(prompt):
+                raise ValueError(
+                    f"`negative_prompt` must have same length as `prompt`. Got {len(negative_prompt)} vs {len(prompt)}."
+                )
+
+            if negative_prompt_embeds_qwen is None:
+                negative_prompt_embeds_qwen, negative_prompt_embeds_clip, negative_prompt_cu_seqlens = (
+                    self.encode_prompt(
+                        prompt=negative_prompt,
+                        image=image,
+                        num_images_per_prompt=num_images_per_prompt,
+                        max_sequence_length=max_sequence_length,
+                        device=device,
+                        dtype=dtype,
+                    )
+                )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables with image conditioning
+        num_channels_latents = self.transformer.config.in_visual_dim
+        latents = self.prepare_latents(
+            image=image,
+            batch_size=batch_size * num_images_per_prompt,
+            num_channels_latents=num_channels_latents,
+            height=height,
+            width=width,
+            dtype=dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        # 6. Prepare rope positions for positional encoding
+        visual_rope_pos = [
+            torch.arange(1, device=device),
+            torch.arange(height // self.vae_scale_factor_spatial // 2, device=device),
+            torch.arange(width // self.vae_scale_factor_spatial // 2, device=device),
+        ]
+
+        text_rope_pos = torch.arange(prompt_cu_seqlens.diff().max().item(), device=device)
+
+        negative_text_rope_pos = (
+            torch.arange(negative_prompt_cu_seqlens.diff().max().item(), device=device)
+            if negative_prompt_cu_seqlens is not None
+            else None
+        )
+
+        # 7. Calculate dynamic scale factor based on resolution
+        scale_factor = [1.0, 1.0, 1.0]
+
+        # 8. Sparse Params for efficient attention
+        sparse_params = None
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                timestep = t.unsqueeze(0).repeat(batch_size * num_images_per_prompt)
+
+                # Predict noise residual
+                pred_velocity = self.transformer(
+                    hidden_states=latents.to(dtype),
+                    encoder_hidden_states=prompt_embeds_qwen.to(dtype),
+                    pooled_projections=prompt_embeds_clip.to(dtype),
+                    timestep=timestep.to(dtype),
+                    visual_rope_pos=visual_rope_pos,
+                    text_rope_pos=text_rope_pos,
+                    scale_factor=scale_factor,
+                    sparse_params=sparse_params,
+                    return_dict=True,
+                ).sample
+
+                if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
+                    uncond_pred_velocity = self.transformer(
+                        hidden_states=latents.to(dtype),
+                        encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
+                        pooled_projections=negative_prompt_embeds_clip.to(dtype),
+                        timestep=timestep.to(dtype),
+                        visual_rope_pos=visual_rope_pos,
+                        text_rope_pos=negative_text_rope_pos,
+                        scale_factor=scale_factor,
+                        sparse_params=sparse_params,
+                        return_dict=True,
+                    ).sample
+
+                    pred_velocity = uncond_pred_velocity + guidance_scale * (pred_velocity - uncond_pred_velocity)
+
+                latents[:, :, :, :, :num_channels_latents] = self.scheduler.step(
+                    pred_velocity[:, :], t, latents[:, :, :, :, :num_channels_latents], return_dict=False
+                )[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds_qwen = callback_outputs.pop("prompt_embeds_qwen", prompt_embeds_qwen)
+                    prompt_embeds_clip = callback_outputs.pop("prompt_embeds_clip", prompt_embeds_clip)
+                    negative_prompt_embeds_qwen = callback_outputs.pop(
+                        "negative_prompt_embeds_qwen", negative_prompt_embeds_qwen
+                    )
+                    negative_prompt_embeds_clip = callback_outputs.pop(
+                        "negative_prompt_embeds_clip", negative_prompt_embeds_clip
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        # 9. Post-processing - extract main latents
+        latents = latents[:, :, :, :, :num_channels_latents]
+
+        # 10. Decode latents to image
+        if output_type != "latent":
+            latents = latents.to(self.vae.dtype)
+            # Reshape and normalize latents
+            latents = latents.reshape(
+                batch_size,
+                num_images_per_prompt,
+                1,
+                height // self.vae_scale_factor_spatial,
+                width // self.vae_scale_factor_spatial,
+                num_channels_latents,
+            )
+            latents = latents.permute(0, 1, 5, 2, 3, 4)  # [batch, num_images, channels, 1, height, width]
+            latents = latents.reshape(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height // self.vae_scale_factor_spatial,
+                width // self.vae_scale_factor_spatial,
+            )
+
+            # Normalize and decode through VAE
+            latents = latents / self.vae.config.scaling_factor
+            image = self.vae.decode(latents).sample
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        else:
+            image = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return KandinskyImagePipelineOutput(image=image)
diff --git a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2v.py b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2v.py
new file mode 100644
index 000000000000..d457c9b69657
--- /dev/null
+++ b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2v.py
@@ -0,0 +1,1054 @@
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from typing import Callable, Dict, List, Optional, Union
+
+import regex as re
+import torch
+from torch.nn import functional as F
+from transformers import CLIPTextModel, CLIPTokenizer, Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput
+from ...loaders import KandinskyLoraLoaderMixin
+from ...models import AutoencoderKLHunyuanVideo
+from ...models.transformers import Kandinsky5Transformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+
+# Add imports for offloading and tiling
+from ...utils import (
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import KandinskyPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+
+logger = logging.get_logger(__name__)
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+
+        ```python
+        >>> import torch
+        >>> from diffusers import Kandinsky5I2VPipeline
+        >>> from diffusers.utils import export_to_video, load_image
+
+        >>> # Available models:
+        >>> # kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers
+
+        >>> model_id = "kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers"
+        >>> pipe = Kandinsky5I2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
+        ... )
+        >>> prompt = "An astronaut floating in space with Earth in the background, cinematic shot"
+        >>> negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
+
+        >>> output = pipe(
+        ...     image=image,
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     height=512,
+        ...     width=768,
+        ...     num_frames=121,
+        ...     num_inference_steps=50,
+        ...     guidance_scale=5.0,
+        ... ).frames[0]
+
+        >>> export_to_video(output, "output.mp4", fps=24, quality=9)
+        ```
+"""
+
+
+def basic_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Clean text using ftfy if available and unescape HTML entities.
+    """
+    if is_ftfy_available():
+        text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Normalize whitespace in text by replacing multiple spaces with single space.
+    """
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+def prompt_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Apply both basic cleaning and whitespace normalization to prompts.
+    """
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+class Kandinsky5I2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
+    r"""
+    Pipeline for image-to-video generation using Kandinsky 5.0.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        transformer ([`Kandinsky5Transformer3DModel`]):
+            Conditional Transformer to denoise the encoded video latents.
+        vae ([`AutoencoderKLHunyuanVideo`]):
+            Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo
+            (vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from
+            latent representations.
+        text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
+            Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
+        tokenizer ([`AutoProcessor`]):
+            Tokenizer for Qwen2.5-VL.
+        text_encoder_2 ([`CLIPTextModel`]):
+            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer_2 ([`CLIPTokenizer`]):
+            Tokenizer for CLIP.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds_qwen",
+        "prompt_embeds_clip",
+        "negative_prompt_embeds_qwen",
+        "negative_prompt_embeds_clip",
+    ]
+
+    def __init__(
+        self,
+        transformer: Kandinsky5Transformer3DModel,
+        vae: AutoencoderKLHunyuanVideo,
+        text_encoder: Qwen2_5_VLForConditionalGeneration,
+        tokenizer: Qwen2VLProcessor,
+        text_encoder_2: CLIPTextModel,
+        tokenizer_2: CLIPTokenizer,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            transformer=transformer,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            scheduler=scheduler,
+        )
+
+        self.prompt_template = "\n".join(
+            [
+                "<|im_start|>system\nYou are a promt engineer. Describe the video in detail.",
+                "Describe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.",
+                "Describe the location of the video, main characters or objects and their action.",
+                "Describe the dynamism of the video and presented actions.",
+                "Name the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or scren content.",
+                "Describe the visual effects, postprocessing and transitions if they are presented in the video.",
+                "Pay attention to the order of key actions shown in the scene.<|im_end|>",
+                "<|im_start|>user\n{}<|im_end|>",
+            ]
+        )
+        self.prompt_template_encode_start_idx = 129
+
+        self.vae_scale_factor_temporal = (
+            self.vae.config.temporal_compression_ratio if getattr(self, "vae", None) else 4
+        )
+        self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    def _get_scale_factor(self, height: int, width: int) -> tuple:
+        """
+        Calculate the scale factor based on resolution.
+
+        Args:
+            height (int): Video height
+            width (int): Video width
+
+        Returns:
+            tuple: Scale factor as (temporal_scale, height_scale, width_scale)
+        """
+
+        def between_480p(x):
+            return 480 <= x <= 854
+
+        if between_480p(height) and between_480p(width):
+            return (1, 2, 2)
+        else:
+            return (1, 3.16, 3.16)
+
+    @staticmethod
+    def fast_sta_nabla(T: int, H: int, W: int, wT: int = 3, wH: int = 3, wW: int = 3, device="cuda") -> torch.Tensor:
+        """
+        Create a sparse temporal attention (STA) mask for efficient video generation.
+
+        This method generates a mask that limits attention to nearby frames and spatial positions, reducing
+        computational complexity for video generation.
+
+        Args:
+            T (int): Number of temporal frames
+            H (int): Height in latent space
+            W (int): Width in latent space
+            wT (int): Temporal attention window size
+            wH (int): Height attention window size
+            wW (int): Width attention window size
+            device (str): Device to create tensor on
+
+        Returns:
+            torch.Tensor: Sparse attention mask of shape (T*H*W, T*H*W)
+        """
+        l = torch.Tensor([T, H, W]).amax()
+        r = torch.arange(0, l, 1, dtype=torch.int16, device=device)
+        mat = (r.unsqueeze(1) - r.unsqueeze(0)).abs()
+        sta_t, sta_h, sta_w = (
+            mat[:T, :T].flatten(),
+            mat[:H, :H].flatten(),
+            mat[:W, :W].flatten(),
+        )
+        sta_t = sta_t <= wT // 2
+        sta_h = sta_h <= wH // 2
+        sta_w = sta_w <= wW // 2
+        sta_hw = (sta_h.unsqueeze(1) * sta_w.unsqueeze(0)).reshape(H, H, W, W).transpose(1, 2).flatten()
+        sta = (sta_t.unsqueeze(1) * sta_hw.unsqueeze(0)).reshape(T, T, H * W, H * W).transpose(1, 2)
+        return sta.reshape(T * H * W, T * H * W)
+
+    def get_sparse_params(self, sample, device):
+        """
+        Generate sparse attention parameters for the transformer based on sample dimensions.
+
+        This method computes the sparse attention configuration needed for efficient video processing in the
+        transformer model.
+
+        Args:
+            sample (torch.Tensor): Input sample tensor
+            device (torch.device): Device to place tensors on
+
+        Returns:
+            Dict: Dictionary containing sparse attention parameters
+        """
+        assert self.transformer.config.patch_size[0] == 1
+        B, T, H, W, _ = sample.shape
+        T, H, W = (
+            T // self.transformer.config.patch_size[0],
+            H // self.transformer.config.patch_size[1],
+            W // self.transformer.config.patch_size[2],
+        )
+        if self.transformer.config.attention_type == "nabla":
+            sta_mask = self.fast_sta_nabla(
+                T,
+                H // 8,
+                W // 8,
+                self.transformer.config.attention_wT,
+                self.transformer.config.attention_wH,
+                self.transformer.config.attention_wW,
+                device=device,
+            )
+
+            sparse_params = {
+                "sta_mask": sta_mask.unsqueeze_(0).unsqueeze_(0),
+                "attention_type": self.transformer.config.attention_type,
+                "to_fractal": True,
+                "P": self.transformer.config.attention_P,
+                "wT": self.transformer.config.attention_wT,
+                "wW": self.transformer.config.attention_wW,
+                "wH": self.transformer.config.attention_wH,
+                "add_sta": self.transformer.config.attention_add_sta,
+                "visual_shape": (T, H, W),
+                "method": self.transformer.config.attention_method,
+            }
+        else:
+            sparse_params = None
+
+        return sparse_params
+
+    def _encode_prompt_qwen(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        max_sequence_length: int = 256,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        Encode prompt using Qwen2.5-VL text encoder.
+
+        This method processes the input prompt through the Qwen2.5-VL model to generate text embeddings suitable for
+        video generation.
+
+        Args:
+            prompt (Union[str, List[str]]): Input prompt or list of prompts
+            device (torch.device): Device to run encoding on
+            max_sequence_length (int): Maximum sequence length for tokenization
+            dtype (torch.dtype): Data type for embeddings
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Text embeddings and cumulative sequence lengths
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        full_texts = [self.prompt_template.format(p) for p in prompt]
+        max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
+
+        untruncated_ids = self.tokenizer(
+            text=full_texts,
+            images=None,
+            videos=None,
+            return_tensors="pt",
+            padding="longest",
+        )["input_ids"]
+
+        if untruncated_ids.shape[-1] > max_allowed_len:
+            for i, text in enumerate(full_texts):
+                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
+                removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
+                if len(removed_text) > 0:
+                    full_texts[i] = text[: -len(removed_text)]
+                    logger.warning(
+                        "The following part of your input was truncated because `max_sequence_length` is set to "
+                        f" {max_sequence_length} tokens: {removed_text}"
+                    )
+        inputs = self.tokenizer(
+            text=full_texts,
+            images=None,
+            videos=None,
+            max_length=max_allowed_len,
+            truncation=True,
+            return_tensors="pt",
+            padding=True,
+        ).to(device)
+
+        embeds = self.text_encoder(
+            input_ids=inputs["input_ids"],
+            return_dict=True,
+            output_hidden_states=True,
+        )["hidden_states"][-1][:, self.prompt_template_encode_start_idx :]
+
+        attention_mask = inputs["attention_mask"][:, self.prompt_template_encode_start_idx :]
+        cu_seqlens = torch.cumsum(attention_mask.sum(1), dim=0)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0).to(dtype=torch.int32)
+
+        return embeds.to(dtype), cu_seqlens
+
+    def _encode_prompt_clip(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        Encode prompt using CLIP text encoder.
+
+        This method processes the input prompt through the CLIP model to generate pooled embeddings that capture
+        semantic information.
+
+        Args:
+            prompt (Union[str, List[str]]): Input prompt or list of prompts
+            device (torch.device): Device to run encoding on
+            dtype (torch.dtype): Data type for embeddings
+
+        Returns:
+            torch.Tensor: Pooled text embeddings from CLIP
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder_2.dtype
+
+        inputs = self.tokenizer_2(
+            prompt,
+            max_length=77,
+            truncation=True,
+            add_special_tokens=True,
+            padding="max_length",
+            return_tensors="pt",
+        ).to(device)
+
+        pooled_embed = self.text_encoder_2(**inputs)["pooler_output"]
+
+        return pooled_embed.to(dtype)
+
+    @staticmethod
+    def adaptive_mean_std_normalization(source, reference):
+        source_mean = source.mean(dim=(1, 2, 3, 4), keepdim=True)
+        source_std = source.std(dim=(1, 2, 3, 4), keepdim=True)
+        # magic constants - limit changes in latents
+        clump_mean_low = 0.05
+        clump_mean_high = 0.1
+        clump_std_low = 0.1
+        clump_std_high = 0.25
+
+        reference_mean = torch.clamp(reference.mean(), source_mean - clump_mean_low, source_mean + clump_mean_high)
+        reference_std = torch.clamp(reference.std(), source_std - clump_std_low, source_std + clump_std_high)
+
+        # normalization
+        normalized = (source - source_mean) / source_std
+        normalized = normalized * reference_std + reference_mean
+
+        return normalized
+
+    def normalize_first_frame(self, latents, reference_frames=5, clump_values=False):
+        latents_copy = latents.clone()
+        samples = latents_copy
+
+        if samples.shape[1] <= 1:
+            return (latents, "Only one frame, no normalization needed")
+
+        nFr = 4
+        first_frames = samples.clone()[:, :nFr]
+        reference_frames_data = samples[:, nFr : nFr + min(reference_frames, samples.shape[1] - 1)]
+
+        normalized_first = self.adaptive_mean_std_normalization(first_frames, reference_frames_data)
+        if clump_values:
+            min_val = reference_frames_data.min()
+            max_val = reference_frames_data.max()
+            normalized_first = torch.clamp(normalized_first, min_val, max_val)
+
+        samples[:, :nFr] = normalized_first
+
+        return samples
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes a single prompt (positive or negative) into text encoder hidden states.
+
+        This method combines embeddings from both Qwen2.5-VL and CLIP text encoders to create comprehensive text
+        representations for video generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                Prompt to be encoded.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos to generate per prompt.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length for text encoding.
+            device (`torch.device`, *optional*):
+                Torch device.
+            dtype (`torch.dtype`, *optional*):
+                Torch dtype.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                - Qwen text embeddings of shape (batch_size * num_videos_per_prompt, sequence_length, embedding_dim)
+                - CLIP pooled embeddings of shape (batch_size * num_videos_per_prompt, clip_embedding_dim)
+                - Cumulative sequence lengths (`cu_seqlens`) for Qwen embeddings of shape (batch_size *
+                  num_videos_per_prompt + 1,)
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+
+        batch_size = len(prompt)
+
+        prompt = [prompt_clean(p) for p in prompt]
+
+        # Encode with Qwen2.5-VL
+        prompt_embeds_qwen, prompt_cu_seqlens = self._encode_prompt_qwen(
+            prompt=prompt,
+            device=device,
+            max_sequence_length=max_sequence_length,
+            dtype=dtype,
+        )
+        # prompt_embeds_qwen shape: [batch_size, seq_len, embed_dim]
+
+        # Encode with CLIP
+        prompt_embeds_clip = self._encode_prompt_clip(
+            prompt=prompt,
+            device=device,
+            dtype=dtype,
+        )
+        # prompt_embeds_clip shape: [batch_size, clip_embed_dim]
+
+        # Repeat embeddings for num_videos_per_prompt
+        # Qwen embeddings: repeat sequence for each video, then reshape
+        prompt_embeds_qwen = prompt_embeds_qwen.repeat(
+            1, num_videos_per_prompt, 1
+        )  # [batch_size, seq_len * num_videos_per_prompt, embed_dim]
+        # Reshape to [batch_size * num_videos_per_prompt, seq_len, embed_dim]
+        prompt_embeds_qwen = prompt_embeds_qwen.view(
+            batch_size * num_videos_per_prompt, -1, prompt_embeds_qwen.shape[-1]
+        )
+
+        # CLIP embeddings: repeat for each video
+        prompt_embeds_clip = prompt_embeds_clip.repeat(
+            1, num_videos_per_prompt, 1
+        )  # [batch_size, num_videos_per_prompt, clip_embed_dim]
+        # Reshape to [batch_size * num_videos_per_prompt, clip_embed_dim]
+        prompt_embeds_clip = prompt_embeds_clip.view(batch_size * num_videos_per_prompt, -1)
+
+        # Repeat cumulative sequence lengths for num_videos_per_prompt
+        # Original differences (lengths) for each prompt in the batch
+        original_lengths = prompt_cu_seqlens.diff()  # [len1, len2, ...]
+        # Repeat the lengths for num_videos_per_prompt
+        repeated_lengths = original_lengths.repeat_interleave(
+            num_videos_per_prompt
+        )  # [len1, len1, ..., len2, len2, ...]
+        # Reconstruct the cumulative lengths
+        repeated_cu_seqlens = torch.cat(
+            [torch.tensor([0], device=device, dtype=torch.int32), repeated_lengths.cumsum(0)]
+        )
+
+        return prompt_embeds_qwen, prompt_embeds_clip, repeated_cu_seqlens
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        image,
+        height,
+        width,
+        prompt_embeds_qwen=None,
+        prompt_embeds_clip=None,
+        negative_prompt_embeds_qwen=None,
+        negative_prompt_embeds_clip=None,
+        prompt_cu_seqlens=None,
+        negative_prompt_cu_seqlens=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        """
+        Validate input parameters for the pipeline.
+
+        Args:
+            prompt: Input prompt
+            negative_prompt: Negative prompt for guidance
+            image: Input image for conditioning
+            height: Video height
+            width: Video width
+            prompt_embeds_qwen: Pre-computed Qwen prompt embeddings
+            prompt_embeds_clip: Pre-computed CLIP prompt embeddings
+            negative_prompt_embeds_qwen: Pre-computed Qwen negative prompt embeddings
+            negative_prompt_embeds_clip: Pre-computed CLIP negative prompt embeddings
+            prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen positive prompt
+            negative_prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen negative prompt
+            callback_on_step_end_tensor_inputs: Callback tensor inputs
+
+        Raises:
+            ValueError: If inputs are invalid
+        """
+
+        if max_sequence_length is not None and max_sequence_length > 1024:
+            raise ValueError("max_sequence_length must be less than 1024")
+
+        if image is None:
+            raise ValueError("`image` must be provided for image-to-video generation")
+
+        if height % 16 != 0 or width % 16 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        # Check for consistency within positive prompt embeddings and sequence lengths
+        if prompt_embeds_qwen is not None or prompt_embeds_clip is not None or prompt_cu_seqlens is not None:
+            if prompt_embeds_qwen is None or prompt_embeds_clip is None or prompt_cu_seqlens is None:
+                raise ValueError(
+                    "If any of `prompt_embeds_qwen`, `prompt_embeds_clip`, or `prompt_cu_seqlens` is provided, "
+                    "all three must be provided."
+                )
+
+        # Check for consistency within negative prompt embeddings and sequence lengths
+        if (
+            negative_prompt_embeds_qwen is not None
+            or negative_prompt_embeds_clip is not None
+            or negative_prompt_cu_seqlens is not None
+        ):
+            if (
+                negative_prompt_embeds_qwen is None
+                or negative_prompt_embeds_clip is None
+                or negative_prompt_cu_seqlens is None
+            ):
+                raise ValueError(
+                    "If any of `negative_prompt_embeds_qwen`, `negative_prompt_embeds_clip`, or `negative_prompt_cu_seqlens` is provided, "
+                    "all three must be provided."
+                )
+
+        # Check if prompt or embeddings are provided (either prompt or all required embedding components for positive)
+        if prompt is None and prompt_embeds_qwen is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_qwen` (and corresponding `prompt_embeds_clip` and `prompt_cu_seqlens`). Cannot leave all undefined."
+            )
+
+        # Validate types for prompt and negative_prompt if provided
+        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+    def prepare_latents(
+        self,
+        image: PipelineImageInput,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Prepare initial latent variables for image-to-video generation.
+
+        This method creates random noise latents for all frames except the first frame, which is replaced with the
+        encoded input image.
+
+        Args:
+            image (PipelineImageInput): Input image to condition the generation on
+            batch_size (int): Number of videos to generate
+            num_channels_latents (int): Number of channels in latent space
+            height (int): Height of generated video
+            width (int): Width of generated video
+            num_frames (int): Number of frames in video
+            dtype (torch.dtype): Data type for latents
+            device (torch.device): Device to create latents on
+            generator (torch.Generator): Random number generator
+            latents (torch.Tensor): Pre-existing latents to use
+
+        Returns:
+            torch.Tensor: Prepared latent tensor with first frame as encoded image
+        """
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        shape = (
+            batch_size,
+            num_latent_frames,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+            num_channels_latents,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        # Generate random noise for all frames
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+        # Encode the input image to use as first frame
+        # Preprocess image
+        image_tensor = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=dtype)
+
+        # Encode image to latents using VAE
+        with torch.no_grad():
+            # Convert image to video format [batch, channels, 1, height, width]
+            image_video = image_tensor.unsqueeze(2)  # Add temporal dimension
+            image_latents = self.vae.encode(image_video).latent_dist.sample(generator=generator)
+
+            # Normalize latents if needed
+            if hasattr(self.vae.config, "scaling_factor"):
+                image_latents = image_latents * self.vae.config.scaling_factor
+
+            # Reshape to match latent dimensions [batch, frames, height, width, channels]
+            image_latents = image_latents.permute(0, 2, 3, 4, 1)  # [batch, 1, H, W, C]
+
+            # Replace first frame with encoded image
+            latents[:, 0:1] = image_latents
+
+        if self.transformer.visual_cond:
+            # For visual conditioning, concatenate with zeros and mask
+            visual_cond = torch.zeros_like(latents)
+            visual_cond_mask = torch.zeros(
+                [
+                    batch_size,
+                    num_latent_frames,
+                    int(height) // self.vae_scale_factor_spatial,
+                    int(width) // self.vae_scale_factor_spatial,
+                    1,
+                ],
+                dtype=latents.dtype,
+                device=latents.device,
+            )
+
+            visual_cond_mask[:, 0:1] = 1
+            visual_cond[:, 0:1] = image_latents
+
+            latents = torch.cat([latents, visual_cond, visual_cond_mask], dim=-1)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        """Get the current guidance scale value."""
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        """Get the number of denoising timesteps."""
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        """Check if generation has been interrupted."""
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 768,
+        num_frames: int = 121,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 5.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds_qwen: Optional[torch.Tensor] = None,
+        prompt_embeds_clip: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_qwen: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_clip: Optional[torch.Tensor] = None,
+        prompt_cu_seqlens: Optional[torch.Tensor] = None,
+        negative_prompt_cu_seqlens: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        The call function to the pipeline for image-to-video generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during video generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
+            height (`int`, defaults to `512`):
+                The height in pixels of the generated video.
+            width (`int`, defaults to `768`):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `121`):
+                The number of frames in the generated video.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps.
+            guidance_scale (`float`, defaults to `5.0`):
+                Guidance scale as defined in classifier-free guidance.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A torch generator to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents.
+            prompt_embeds_qwen (`torch.Tensor`, *optional*):
+                Pre-generated Qwen text embeddings.
+            prompt_embeds_clip (`torch.Tensor`, *optional*):
+                Pre-generated CLIP text embeddings.
+            negative_prompt_embeds_qwen (`torch.Tensor`, *optional*):
+                Pre-generated Qwen negative text embeddings.
+            negative_prompt_embeds_clip (`torch.Tensor`, *optional*):
+                Pre-generated CLIP negative text embeddings.
+            prompt_cu_seqlens (`torch.Tensor`, *optional*):
+                Pre-generated cumulative sequence lengths for Qwen positive prompt.
+            negative_prompt_cu_seqlens (`torch.Tensor`, *optional*):
+                Pre-generated cumulative sequence lengths for Qwen negative prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`KandinskyPipelineOutput`].
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function that is called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length for text encoding.
+
+        Examples:
+
+        Returns:
+            [`~KandinskyPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`KandinskyPipelineOutput`] is returned, otherwise a `tuple` is returned
+                where the first element is a list with the generated videos.
+        """
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            image=image,
+            height=height,
+            width=width,
+            prompt_embeds_qwen=prompt_embeds_qwen,
+            prompt_embeds_clip=prompt_embeds_clip,
+            negative_prompt_embeds_qwen=negative_prompt_embeds_qwen,
+            negative_prompt_embeds_clip=negative_prompt_embeds_clip,
+            prompt_cu_seqlens=prompt_cu_seqlens,
+            negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
+        self._guidance_scale = guidance_scale
+        self._interrupt = False
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds_qwen.shape[0]
+
+        # 3. Encode input prompt
+        if prompt_embeds_qwen is None:
+            prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = self.encode_prompt(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if self.guidance_scale > 1.0:
+            if negative_prompt is None:
+                negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
+
+            if isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt] * len(prompt) if prompt is not None else [negative_prompt]
+            elif len(negative_prompt) != len(prompt):
+                raise ValueError(
+                    f"`negative_prompt` must have same length as `prompt`. Got {len(negative_prompt)} vs {len(prompt)}."
+                )
+
+            if negative_prompt_embeds_qwen is None:
+                negative_prompt_embeds_qwen, negative_prompt_embeds_clip, negative_prompt_cu_seqlens = (
+                    self.encode_prompt(
+                        prompt=negative_prompt,
+                        num_videos_per_prompt=num_videos_per_prompt,
+                        max_sequence_length=max_sequence_length,
+                        device=device,
+                        dtype=dtype,
+                    )
+                )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables with image conditioning
+        num_channels_latents = self.transformer.config.in_visual_dim
+        latents = self.prepare_latents(
+            image=image,
+            batch_size=batch_size * num_videos_per_prompt,
+            num_channels_latents=num_channels_latents,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            dtype=dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        # 6. Prepare rope positions for positional encoding
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        visual_rope_pos = [
+            torch.arange(num_latent_frames, device=device),
+            torch.arange(height // self.vae_scale_factor_spatial // 2, device=device),
+            torch.arange(width // self.vae_scale_factor_spatial // 2, device=device),
+        ]
+
+        text_rope_pos = torch.arange(prompt_cu_seqlens.diff().max().item(), device=device)
+
+        negative_text_rope_pos = (
+            torch.arange(negative_prompt_cu_seqlens.diff().max().item(), device=device)
+            if negative_prompt_cu_seqlens is not None
+            else None
+        )
+
+        # 7. Calculate dynamic scale factor based on resolution
+        scale_factor = self._get_scale_factor(height, width)
+
+        # 8. Sparse Params for efficient attention
+        sparse_params = self.get_sparse_params(latents, device)
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                timestep = t.unsqueeze(0).repeat(batch_size * num_videos_per_prompt)
+
+                # Predict noise residual
+                pred_velocity = self.transformer(
+                    hidden_states=latents.to(dtype),
+                    encoder_hidden_states=prompt_embeds_qwen.to(dtype),
+                    pooled_projections=prompt_embeds_clip.to(dtype),
+                    timestep=timestep.to(dtype),
+                    visual_rope_pos=visual_rope_pos,
+                    text_rope_pos=text_rope_pos,
+                    scale_factor=scale_factor,
+                    sparse_params=sparse_params,
+                    return_dict=True,
+                ).sample
+
+                if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
+                    uncond_pred_velocity = self.transformer(
+                        hidden_states=latents.to(dtype),
+                        encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
+                        pooled_projections=negative_prompt_embeds_clip.to(dtype),
+                        timestep=timestep.to(dtype),
+                        visual_rope_pos=visual_rope_pos,
+                        text_rope_pos=negative_text_rope_pos,
+                        scale_factor=scale_factor,
+                        sparse_params=sparse_params,
+                        return_dict=True,
+                    ).sample
+
+                    pred_velocity = uncond_pred_velocity + guidance_scale * (pred_velocity - uncond_pred_velocity)
+
+                latents[:, 1:, :, :, :num_channels_latents] = self.scheduler.step(
+                    pred_velocity[:, 1:], t, latents[:, 1:, :, :, :num_channels_latents], return_dict=False
+                )[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds_qwen = callback_outputs.pop("prompt_embeds_qwen", prompt_embeds_qwen)
+                    prompt_embeds_clip = callback_outputs.pop("prompt_embeds_clip", prompt_embeds_clip)
+                    negative_prompt_embeds_qwen = callback_outputs.pop(
+                        "negative_prompt_embeds_qwen", negative_prompt_embeds_qwen
+                    )
+                    negative_prompt_embeds_clip = callback_outputs.pop(
+                        "negative_prompt_embeds_clip", negative_prompt_embeds_clip
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        # 9. Post-processing - extract main latents
+        latents = latents[:, :, :, :, :num_channels_latents]
+
+        # 10. fix mesh artifacts
+        latents = self.normalize_first_frame(latents)
+
+        # 11. Decode latents to video
+        if output_type != "latent":
+            latents = latents.to(self.vae.dtype)
+            # Reshape and normalize latents
+            video = latents.reshape(
+                batch_size,
+                num_videos_per_prompt,
+                (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+                height // self.vae_scale_factor_spatial,
+                width // self.vae_scale_factor_spatial,
+                num_channels_latents,
+            )
+            video = video.permute(0, 1, 5, 2, 3, 4)  # [batch, num_videos, channels, frames, height, width]
+            video = video.reshape(
+                batch_size * num_videos_per_prompt,
+                num_channels_latents,
+                (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+                height // self.vae_scale_factor_spatial,
+                width // self.vae_scale_factor_spatial,
+            )
+
+            # Normalize and decode through VAE
+            video = video / self.vae.config.scaling_factor
+            video = self.vae.decode(video).sample
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return KandinskyPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_t2i.py b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_t2i.py
new file mode 100644
index 000000000000..bb5c40327b4e
--- /dev/null
+++ b/src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_t2i.py
@@ -0,0 +1,818 @@
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import regex as re
+import torch
+from torch.nn import functional as F
+from transformers import CLIPTextModel, CLIPTokenizer, Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import VaeImageProcessor
+from ...loaders import KandinskyLoraLoaderMixin
+from ...models import AutoencoderKL
+from ...models.transformers import Kandinsky5Transformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+
+# Add imports for offloading and tiling
+from ...utils import (
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import KandinskyImagePipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+
+logger = logging.get_logger(__name__)
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+
+        ```python
+        >>> import torch
+        >>> from diffusers import Kandinsky5T2IPipeline
+
+        >>> # Available models:
+        >>> # kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers
+        >>> # kandinskylab/Kandinsky-5.0-T2I-Lite-pretrain-Diffusers
+
+        >>> model_id = "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers"
+        >>> pipe = Kandinsky5T2IPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> prompt = "A cat and a dog baking a cake together in a kitchen."
+
+        >>> output = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt="",
+        ...     height=1024,
+        ...     width=1024,
+        ...     num_inference_steps=50,
+        ...     guidance_scale=3.5,
+        ... ).frames[0]
+        ```
+"""
+
+
+def basic_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Clean text using ftfy if available and unescape HTML entities.
+    """
+    if is_ftfy_available():
+        text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Normalize whitespace in text by replacing multiple spaces with single space.
+    """
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+def prompt_clean(text):
+    """
+    Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
+
+    Apply both basic cleaning and whitespace normalization to prompts.
+    """
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+class Kandinsky5T2IPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
+    r"""
+    Pipeline for text-to-image generation using Kandinsky 5.0.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        transformer ([`Kandinsky5Transformer3DModel`]):
+            Conditional Transformer to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder Model [black-forest-labs/FLUX.1-dev
+            (vae)](https://huggingface.co/black-forest-labs/FLUX.1-dev) to encode and decode videos to and from latent
+            representations.
+        text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
+            Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
+        tokenizer ([`AutoProcessor`]):
+            Tokenizer for Qwen2.5-VL.
+        text_encoder_2 ([`CLIPTextModel`]):
+            Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer_2 ([`CLIPTokenizer`]):
+            Tokenizer for CLIP.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds_qwen",
+        "prompt_embeds_clip",
+        "negative_prompt_embeds_qwen",
+        "negative_prompt_embeds_clip",
+    ]
+
+    def __init__(
+        self,
+        transformer: Kandinsky5Transformer3DModel,
+        vae: AutoencoderKL,
+        text_encoder: Qwen2_5_VLForConditionalGeneration,
+        tokenizer: Qwen2VLProcessor,
+        text_encoder_2: CLIPTextModel,
+        tokenizer_2: CLIPTokenizer,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            transformer=transformer,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            scheduler=scheduler,
+        )
+
+        self.prompt_template = "<|im_start|>system\nYou are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
+        self.prompt_template_encode_start_idx = 41
+
+        self.vae_scale_factor_spatial = 8
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        self.resolutions = [(1024, 1024), (640, 1408), (1408, 640), (768, 1280), (1280, 768), (896, 1152), (1152, 896)]
+
+    def _encode_prompt_qwen(
+        self,
+        prompt: List[str],
+        device: Optional[torch.device] = None,
+        max_sequence_length: int = 512,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        Encode prompt using Qwen2.5-VL text encoder.
+
+        This method processes the input prompt through the Qwen2.5-VL model to generate text embeddings suitable for
+        image generation.
+
+        Args:
+            prompt List[str]: Input list of prompts
+            device (torch.device): Device to run encoding on
+            max_sequence_length (int): Maximum sequence length for tokenization
+            dtype (torch.dtype): Data type for embeddings
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Text embeddings and cumulative sequence lengths
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        full_texts = [self.prompt_template.format(p) for p in prompt]
+        max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
+
+        untruncated_ids = self.tokenizer(
+            text=full_texts,
+            images=None,
+            videos=None,
+            return_tensors="pt",
+            padding="longest",
+        )["input_ids"]
+
+        if untruncated_ids.shape[-1] > max_allowed_len:
+            for i, text in enumerate(full_texts):
+                tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
+                removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
+                if len(removed_text) > 0:
+                    full_texts[i] = text[: -len(removed_text)]
+                    logger.warning(
+                        "The following part of your input was truncated because `max_sequence_length` is set to "
+                        f" {max_sequence_length} tokens: {removed_text}"
+                    )
+
+        inputs = self.tokenizer(
+            text=full_texts,
+            images=None,
+            videos=None,
+            max_length=max_allowed_len,
+            truncation=True,
+            return_tensors="pt",
+            padding=True,
+        ).to(device)
+
+        embeds = self.text_encoder(
+            input_ids=inputs["input_ids"],
+            return_dict=True,
+            output_hidden_states=True,
+        )["hidden_states"][-1][:, self.prompt_template_encode_start_idx :]
+        attention_mask = inputs["attention_mask"][:, self.prompt_template_encode_start_idx :]
+        cu_seqlens = torch.cumsum(attention_mask.sum(1), dim=0)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0).to(dtype=torch.int32)
+
+        return embeds.to(dtype), cu_seqlens
+
+    def _encode_prompt_clip(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """
+        Encode prompt using CLIP text encoder.
+
+        This method processes the input prompt through the CLIP model to generate pooled embeddings that capture
+        semantic information.
+
+        Args:
+            prompt (Union[str, List[str]]): Input prompt or list of prompts
+            device (torch.device): Device to run encoding on
+            dtype (torch.dtype): Data type for embeddings
+
+        Returns:
+            torch.Tensor: Pooled text embeddings from CLIP
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder_2.dtype
+
+        inputs = self.tokenizer_2(
+            prompt,
+            max_length=77,
+            truncation=True,
+            add_special_tokens=True,
+            padding="max_length",
+            return_tensors="pt",
+        ).to(device)
+
+        pooled_embed = self.text_encoder_2(**inputs)["pooler_output"]
+
+        return pooled_embed.to(dtype)
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_images_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes a single prompt (positive or negative) into text encoder hidden states.
+
+        This method combines embeddings from both Qwen2.5-VL and CLIP text encoders to create comprehensive text
+        representations for image generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                Prompt to be encoded.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                Number of images to generate per prompt.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length for text encoding. Must be less than 1024
+            device (`torch.device`, *optional*):
+                Torch device.
+            dtype (`torch.dtype`, *optional*):
+                Torch dtype.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                - Qwen text embeddings of shape (batch_size * num_images_per_prompt, sequence_length, embedding_dim)
+                - CLIP pooled embeddings of shape (batch_size * num_images_per_prompt, clip_embedding_dim)
+                - Cumulative sequence lengths (`cu_seqlens`) for Qwen embeddings of shape (batch_size *
+                  num_images_per_prompt + 1,)
+        """
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+
+        batch_size = len(prompt)
+
+        prompt = [prompt_clean(p) for p in prompt]
+
+        # Encode with Qwen2.5-VL
+        prompt_embeds_qwen, prompt_cu_seqlens = self._encode_prompt_qwen(
+            prompt=prompt,
+            device=device,
+            max_sequence_length=max_sequence_length,
+            dtype=dtype,
+        )
+        # prompt_embeds_qwen shape: [batch_size, seq_len, embed_dim]
+
+        # Encode with CLIP
+        prompt_embeds_clip = self._encode_prompt_clip(
+            prompt=prompt,
+            device=device,
+            dtype=dtype,
+        )
+        # prompt_embeds_clip shape: [batch_size, clip_embed_dim]
+
+        # Repeat embeddings for num_images_per_prompt
+        # Qwen embeddings: repeat sequence for each image, then reshape
+        prompt_embeds_qwen = prompt_embeds_qwen.repeat(
+            1, num_images_per_prompt, 1
+        )  # [batch_size, seq_len * num_images_per_prompt, embed_dim]
+        # Reshape to [batch_size * num_images_per_prompt, seq_len, embed_dim]
+        prompt_embeds_qwen = prompt_embeds_qwen.view(
+            batch_size * num_images_per_prompt, -1, prompt_embeds_qwen.shape[-1]
+        )
+
+        # CLIP embeddings: repeat for each image
+        prompt_embeds_clip = prompt_embeds_clip.repeat(
+            1, num_images_per_prompt, 1
+        )  # [batch_size, num_images_per_prompt, clip_embed_dim]
+        # Reshape to [batch_size * num_images_per_prompt, clip_embed_dim]
+        prompt_embeds_clip = prompt_embeds_clip.view(batch_size * num_images_per_prompt, -1)
+
+        # Repeat cumulative sequence lengths for num_images_per_prompt
+        # Original differences (lengths) for each prompt in the batch
+        original_lengths = prompt_cu_seqlens.diff()  # [len1, len2, ...]
+        # Repeat the lengths for num_images_per_prompt
+        repeated_lengths = original_lengths.repeat_interleave(
+            num_images_per_prompt
+        )  # [len1, len1, ..., len2, len2, ...]
+        # Reconstruct the cumulative lengths
+        repeated_cu_seqlens = torch.cat(
+            [torch.tensor([0], device=device, dtype=torch.int32), repeated_lengths.cumsum(0)]
+        )
+
+        return prompt_embeds_qwen, prompt_embeds_clip, repeated_cu_seqlens
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        height,
+        width,
+        prompt_embeds_qwen=None,
+        prompt_embeds_clip=None,
+        negative_prompt_embeds_qwen=None,
+        negative_prompt_embeds_clip=None,
+        prompt_cu_seqlens=None,
+        negative_prompt_cu_seqlens=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        """
+        Validate input parameters for the pipeline.
+
+        Args:
+            prompt: Input prompt
+            negative_prompt: Negative prompt for guidance
+            height: Image height
+            width: Image width
+            prompt_embeds_qwen: Pre-computed Qwen prompt embeddings
+            prompt_embeds_clip: Pre-computed CLIP prompt embeddings
+            negative_prompt_embeds_qwen: Pre-computed Qwen negative prompt embeddings
+            negative_prompt_embeds_clip: Pre-computed CLIP negative prompt embeddings
+            prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen positive prompt
+            negative_prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen negative prompt
+            callback_on_step_end_tensor_inputs: Callback tensor inputs
+
+        Raises:
+            ValueError: If inputs are invalid
+        """
+
+        if max_sequence_length is not None and max_sequence_length > 1024:
+            raise ValueError("max_sequence_length must be less than 1024")
+
+        if (width, height) not in self.resolutions:
+            resolutions_str = ",".join([f"({w},{h})" for w, h in self.resolutions])
+            logger.warning(
+                f"`height` and `width` have to be one of {resolutions_str}, but are {height} and {width}. Dimensions will be resized accordingly"
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        # Check for consistency within positive prompt embeddings and sequence lengths
+        if prompt_embeds_qwen is not None or prompt_embeds_clip is not None or prompt_cu_seqlens is not None:
+            if prompt_embeds_qwen is None or prompt_embeds_clip is None or prompt_cu_seqlens is None:
+                raise ValueError(
+                    "If any of `prompt_embeds_qwen`, `prompt_embeds_clip`, or `prompt_cu_seqlens` is provided, "
+                    "all three must be provided."
+                )
+
+        # Check for consistency within negative prompt embeddings and sequence lengths
+        if (
+            negative_prompt_embeds_qwen is not None
+            or negative_prompt_embeds_clip is not None
+            or negative_prompt_cu_seqlens is not None
+        ):
+            if (
+                negative_prompt_embeds_qwen is None
+                or negative_prompt_embeds_clip is None
+                or negative_prompt_cu_seqlens is None
+            ):
+                raise ValueError(
+                    "If any of `negative_prompt_embeds_qwen`, `negative_prompt_embeds_clip`, or `negative_prompt_cu_seqlens` is provided, "
+                    "all three must be provided."
+                )
+
+        # Check if prompt or embeddings are provided (either prompt or all required embedding components for positive)
+        if prompt is None and prompt_embeds_qwen is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds_qwen` (and corresponding `prompt_embeds_clip` and `prompt_cu_seqlens`). Cannot leave all undefined."
+            )
+
+        # Validate types for prompt and negative_prompt if provided
+        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 1024,
+        width: int = 1024,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Prepare initial latent variables for text-to-image generation.
+
+        This method creates random noise latents
+
+        Args:
+            batch_size (int): Number of images to generate
+            num_channels_latents (int): Number of channels in latent space
+            height (int): Height of generated image
+            width (int): Width of generated image
+            dtype (torch.dtype): Data type for latents
+            device (torch.device): Device to create latents on
+            generator (torch.Generator): Random number generator
+            latents (torch.Tensor): Pre-existing latents to use
+
+        Returns:
+            torch.Tensor: Prepared latent tensor
+        """
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        shape = (
+            batch_size,
+            1,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+            num_channels_latents,
+        )
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        # Generate random noise
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+
+    @property
+    def guidance_scale(self):
+        """Get the current guidance scale value."""
+        return self._guidance_scale
+
+    @property
+    def num_timesteps(self):
+        """Get the number of denoising timesteps."""
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        """Check if generation has been interrupted."""
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 3.5,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds_qwen: Optional[torch.Tensor] = None,
+        prompt_embeds_clip: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_qwen: Optional[torch.Tensor] = None,
+        negative_prompt_embeds_clip: Optional[torch.Tensor] = None,
+        prompt_cu_seqlens: Optional[torch.Tensor] = None,
+        negative_prompt_cu_seqlens: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        The call function to the pipeline for text-to-image generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (`guidance_scale` < `1`).
+            height (`int`, defaults to `1024`):
+                The height in pixels of the generated image.
+            width (`int`, defaults to `1024`):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps.
+            guidance_scale (`float`, defaults to `5.0`):
+                Guidance scale as defined in classifier-free guidance.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A torch generator to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents.
+            prompt_embeds_qwen (`torch.Tensor`, *optional*):
+                Pre-generated Qwen text embeddings.
+            prompt_embeds_clip (`torch.Tensor`, *optional*):
+                Pre-generated CLIP text embeddings.
+            negative_prompt_embeds_qwen (`torch.Tensor`, *optional*):
+                Pre-generated Qwen negative text embeddings.
+            negative_prompt_embeds_clip (`torch.Tensor`, *optional*):
+                Pre-generated CLIP negative text embeddings.
+            prompt_cu_seqlens (`torch.Tensor`, *optional*):
+                Pre-generated cumulative sequence lengths for Qwen positive prompt.
+            negative_prompt_cu_seqlens (`torch.Tensor`, *optional*):
+                Pre-generated cumulative sequence lengths for Qwen negative prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`KandinskyImagePipelineOutput`].
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function that is called at the end of each denoising step.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length for text encoding.
+
+        Examples:
+
+        Returns:
+            [`~KandinskyImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`KandinskyImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+        self.check_inputs(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            prompt_embeds_qwen=prompt_embeds_qwen,
+            prompt_embeds_clip=prompt_embeds_clip,
+            negative_prompt_embeds_qwen=negative_prompt_embeds_qwen,
+            negative_prompt_embeds_clip=negative_prompt_embeds_clip,
+            prompt_cu_seqlens=prompt_cu_seqlens,
+            negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+        if (width, height) not in self.resolutions:
+            width, height = self.resolutions[
+                np.argmin([abs((i[0] / i[1]) - (width / height)) for i in self.resolutions])
+            ]
+
+        self._guidance_scale = guidance_scale
+        self._interrupt = False
+
+        device = self._execution_device
+        dtype = self.transformer.dtype
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds_qwen.shape[0]
+
+        # 3. Encode input prompt
+        if prompt_embeds_qwen is None:
+            prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = self.encode_prompt(
+                prompt=prompt,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if self.guidance_scale > 1.0:
+            if negative_prompt is None:
+                negative_prompt = ""
+
+            if isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt] * len(prompt) if prompt is not None else [negative_prompt]
+            elif len(negative_prompt) != len(prompt):
+                raise ValueError(
+                    f"`negative_prompt` must have same length as `prompt`. Got {len(negative_prompt)} vs {len(prompt)}."
+                )
+
+            if negative_prompt_embeds_qwen is None:
+                negative_prompt_embeds_qwen, negative_prompt_embeds_clip, negative_prompt_cu_seqlens = (
+                    self.encode_prompt(
+                        prompt=negative_prompt,
+                        num_images_per_prompt=num_images_per_prompt,
+                        max_sequence_length=max_sequence_length,
+                        device=device,
+                        dtype=dtype,
+                    )
+                )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_visual_dim
+        latents = self.prepare_latents(
+            batch_size=batch_size * num_images_per_prompt,
+            num_channels_latents=num_channels_latents,
+            height=height,
+            width=width,
+            dtype=dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        # 6. Prepare rope positions for positional encoding
+        visual_rope_pos = [
+            torch.arange(1, device=device),
+            torch.arange(height // self.vae_scale_factor_spatial // 2, device=device),
+            torch.arange(width // self.vae_scale_factor_spatial // 2, device=device),
+        ]
+
+        text_rope_pos = torch.arange(prompt_cu_seqlens.diff().max().item(), device=device)
+
+        negative_text_rope_pos = (
+            torch.arange(negative_prompt_cu_seqlens.diff().max().item(), device=device)
+            if negative_prompt_cu_seqlens is not None
+            else None
+        )
+
+        # 7. Calculate dynamic scale factor based on resolution
+        scale_factor = [1.0, 1.0, 1.0]
+
+        # 8. Sparse Params for efficient attention
+        sparse_params = None
+
+        # 9. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                timestep = t.unsqueeze(0).repeat(batch_size * num_images_per_prompt)
+
+                # Predict noise residual
+                pred_velocity = self.transformer(
+                    hidden_states=latents.to(dtype),
+                    encoder_hidden_states=prompt_embeds_qwen.to(dtype),
+                    pooled_projections=prompt_embeds_clip.to(dtype),
+                    timestep=timestep.to(dtype),
+                    visual_rope_pos=visual_rope_pos,
+                    text_rope_pos=text_rope_pos,
+                    scale_factor=scale_factor,
+                    sparse_params=sparse_params,
+                    return_dict=True,
+                ).sample
+
+                if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
+                    uncond_pred_velocity = self.transformer(
+                        hidden_states=latents.to(dtype),
+                        encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
+                        pooled_projections=negative_prompt_embeds_clip.to(dtype),
+                        timestep=timestep.to(dtype),
+                        visual_rope_pos=visual_rope_pos,
+                        text_rope_pos=negative_text_rope_pos,
+                        scale_factor=scale_factor,
+                        sparse_params=sparse_params,
+                        return_dict=True,
+                    ).sample
+
+                    pred_velocity = uncond_pred_velocity + guidance_scale * (pred_velocity - uncond_pred_velocity)
+
+                latents = self.scheduler.step(pred_velocity[:, :], t, latents, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds_qwen = callback_outputs.pop("prompt_embeds_qwen", prompt_embeds_qwen)
+                    prompt_embeds_clip = callback_outputs.pop("prompt_embeds_clip", prompt_embeds_clip)
+                    negative_prompt_embeds_qwen = callback_outputs.pop(
+                        "negative_prompt_embeds_qwen", negative_prompt_embeds_qwen
+                    )
+                    negative_prompt_embeds_clip = callback_outputs.pop(
+                        "negative_prompt_embeds_clip", negative_prompt_embeds_clip
+                    )
+
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        # 9. Post-processing - extract main latents
+        latents = latents[:, :, :, :, :num_channels_latents]
+
+        # 10. Decode latents to image
+        if output_type != "latent":
+            latents = latents.to(self.vae.dtype)
+            # Reshape and normalize latents
+            latents = latents.reshape(
+                batch_size,
+                num_images_per_prompt,
+                1,
+                height // self.vae_scale_factor_spatial,
+                width // self.vae_scale_factor_spatial,
+                num_channels_latents,
+            )
+            latents = latents.permute(0, 1, 5, 2, 3, 4)  # [batch, num_images, channels, 1, height, width]
+            latents = latents.reshape(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height // self.vae_scale_factor_spatial,
+                width // self.vae_scale_factor_spatial,
+            )
+
+            # Normalize and decode through VAE
+            latents = latents / self.vae.config.scaling_factor
+            image = self.vae.decode(latents).sample
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        else:
+            image = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return KandinskyImagePipelineOutput(image=image)
diff --git a/src/diffusers/pipelines/kandinsky5/pipeline_output.py b/src/diffusers/pipelines/kandinsky5/pipeline_output.py
index ed77d42a9a83..2172ddff7e22 100644
--- a/src/diffusers/pipelines/kandinsky5/pipeline_output.py
+++ b/src/diffusers/pipelines/kandinsky5/pipeline_output.py
@@ -8,7 +8,7 @@
 @dataclass
 class KandinskyPipelineOutput(BaseOutput):
     r"""
-    Output class for Wan pipelines.
+    Output class for kandinsky video pipelines.
 
     Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
@@ -18,3 +18,18 @@ class KandinskyPipelineOutput(BaseOutput):
     """
 
     frames: torch.Tensor
+
+
+@dataclass
+class KandinskyImagePipelineOutput(BaseOutput):
+    r"""
+    Output class for kandinsky image pipelines.
+
+    Args:
+        image (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]):
+            List of image outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image. It can also be a NumPy array or Torch tensor of shape `(batch_size, channels, height,
+            width)`.
+    """
+
+    image: torch.Tensor
diff --git a/src/diffusers/pipelines/kolors/pipeline_kolors.py b/src/diffusers/pipelines/kolors/pipeline_kolors.py
index 948f73ed91eb..7c8468bcb109 100644
--- a/src/diffusers/pipelines/kolors/pipeline_kolors.py
+++ b/src/diffusers/pipelines/kolors/pipeline_kolors.py
@@ -21,9 +21,8 @@
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import KolorsPipelineOutput
@@ -580,22 +579,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py b/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py
index 67d49b9a8c5e..10a7962c258c 100644
--- a/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py
+++ b/src/diffusers/pipelines/kolors/pipeline_kolors_img2img.py
@@ -22,9 +22,8 @@
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import empty_device_cache, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from .pipeline_output import KolorsPipelineOutput
@@ -708,22 +707,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
index c1f9a98f0632..993957a052fc 100644
--- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py
@@ -37,8 +37,6 @@
 from ...models.attention_processor import (
     Attention,
     AttnProcessor,
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
 )
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler
@@ -688,21 +686,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx.py b/src/diffusers/pipelines/ltx/pipeline_ltx.py
index bd23e657c408..8ca8b4419e18 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx.py
@@ -590,9 +590,10 @@ def __call__(
                 the text `prompt`, usually at the expense of lower image quality.
             guidance_rescale (`float`, *optional*, defaults to 0.0):
                 Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+                Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -777,7 +778,7 @@ def __call__(
                     noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
 
                     if self.guidance_rescale > 0:
-                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        # Based on 3.4. in https://huggingface.co/papers/2305.08891
                         noise_pred = rescale_noise_cfg(
                             noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
                         )
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py b/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py
index 537588f67c95..48a6f0837c8d 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx_condition.py
@@ -927,9 +927,10 @@ def __call__(
                 the text `prompt`, usually at the expense of lower image quality.
             guidance_rescale (`float`, *optional*, defaults to 0.0):
                 Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+                Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -1194,7 +1195,7 @@ def __call__(
                     timestep, _ = timestep.chunk(2)
 
                     if self.guidance_rescale > 0:
-                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        # Based on 3.4. in https://huggingface.co/papers/2305.08891
                         noise_pred = rescale_noise_cfg(
                             noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
                         )
diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
index 694378b4f040..f30f8a3dc8f6 100644
--- a/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
+++ b/src/diffusers/pipelines/ltx/pipeline_ltx_image2video.py
@@ -654,9 +654,10 @@ def __call__(
                 the text `prompt`, usually at the expense of lower image quality.
             guidance_rescale (`float`, *optional*, defaults to 0.0):
                 Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+                Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
+                using zero terminal SNR.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -851,7 +852,7 @@ def __call__(
                     timestep, _ = timestep.chunk(2)
 
                     if self.guidance_rescale > 0:
-                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        # Based on 3.4. in https://huggingface.co/papers/2305.08891
                         noise_pred = rescale_noise_cfg(
                             noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
                         )
diff --git a/src/diffusers/pipelines/ovis_image/__init__.py b/src/diffusers/pipelines/ovis_image/__init__.py
new file mode 100644
index 000000000000..275061b1f626
--- /dev/null
+++ b/src/diffusers/pipelines/ovis_image/__init__.py
@@ -0,0 +1,50 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa: F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_output"] = ["OvisImagePipelineOutput"]
+    _import_structure["pipeline_ovis_image"] = ["OvisImagePipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_output import OvisImagePipelineOutput
+        from .pipeline_ovis_image import OvisImagePipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/ovis_image/pipeline_output.py b/src/diffusers/pipelines/ovis_image/pipeline_output.py
new file mode 100644
index 000000000000..160c5b73a917
--- /dev/null
+++ b/src/diffusers/pipelines/ovis_image/pipeline_output.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Alibaba Ovis-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from diffusers.utils import BaseOutput
+
+
+@dataclass
+class OvisImagePipelineOutput(BaseOutput):
+    """
+    Output class for Ovis-Image pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/src/diffusers/pipelines/ovis_image/pipeline_ovis_image.py b/src/diffusers/pipelines/ovis_image/pipeline_ovis_image.py
new file mode 100644
index 000000000000..94d6cee93d7e
--- /dev/null
+++ b/src/diffusers/pipelines/ovis_image/pipeline_ovis_image.py
@@ -0,0 +1,668 @@
+# Copyright 2025 Alibaba Ovis-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from transformers import Qwen2TokenizerFast, Qwen3Model
+
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKL, OvisImageTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import OvisImagePipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import OvisImagePipeline
+
+        >>> pipe = OvisImagePipeline.from_pretrained("AIDC-AI/Ovis-Image-7B", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> prompt = 'A creative 3D artistic render where the text "OVIS-IMAGE" is written in a bold, expressive handwritten brush style using thick, wet oil paint. The paint is a mix of vibrant rainbow colors (red, blue, yellow) swirling together like toothpaste or impasto art. You can see the ridges of the brush bristles and the glossy, wet texture of the paint. The background is a clean artist\'s canvas. Dynamic lighting creates soft shadows behind the floating paint strokes. Colorful, expressive, tactile texture, 4k detail.'
+        >>> image = pipe(prompt, negative_prompt="", num_inference_steps=50, guidance_scale=5.0).images[0]
+        >>> image.save("ovis_image.png")
+        ```
+"""
+
+
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class OvisImagePipeline(
+    DiffusionPipeline,
+):
+    r"""
+    The Ovis-Image pipeline for text-to-image generation.
+
+    Reference: https://github.com/AIDC-AI/Ovis-Image
+
+    Args:
+        transformer ([`OvisImageTransformer2DModel`]):
+            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`Qwen3Model`]):
+            Text encoder of class
+            [Qwen3Model](https://huggingface.co/docs/transformers/en/model_doc/qwen3#transformers.Qwen3Model).
+        tokenizer (`Qwen2TokenizerFast`):
+            Tokenizer of class
+            [Qwen2TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/qwen2#transformers.Qwen2TokenizerFast).
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: Qwen3Model,
+        tokenizer: Qwen2TokenizerFast,
+        transformer: OvisImageTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        # Ovis-Image latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
+        # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        self.system_prompt = "Describe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background: "
+        self.user_prompt_begin_id = 28
+        self.tokenizer_max_length = 256 + self.user_prompt_begin_id
+        self.default_sample_size = 128
+
+    def _get_messages(
+        self,
+        prompt: Union[str, List[str]] = None,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        messages = []
+        for each_prompt in prompt:
+            message = [
+                {
+                    "role": "user",
+                    "content": self.system_prompt + each_prompt,
+                }
+            ]
+            message = self.tokenizer.apply_chat_template(
+                message, tokenize=False, add_generation_prompt=True, enable_thinking=False
+            )
+            messages.append(message)
+        return messages
+
+    def _get_ovis_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        messages = self._get_messages(prompt)
+        batch_size = len(messages)
+
+        tokens = self.tokenizer(
+            messages,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer_max_length,
+            return_tensors="pt",
+            add_special_tokens=False,
+        )
+        input_ids = tokens.input_ids.to(device)
+        attention_mask = tokens.attention_mask.to(device)
+        outputs = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = outputs.last_hidden_state
+        prompt_embeds = prompt_embeds * attention_mask[..., None]
+        prompt_embeds = prompt_embeds[:, self.user_prompt_begin_id :, :]
+
+        _, seq_len, _ = prompt_embeds.shape
+
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        num_images_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        r"""
+
+        Args:
+            prompt (`str`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+        """
+        device = device or self._execution_device
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_ovis_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+
+        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
+        text_ids = torch.zeros(prompt_embeds.shape[1], 3)
+        text_ids[..., 1] = text_ids[..., 1] + torch.arange(prompt_embeds.shape[1])[None, :]
+        text_ids[..., 2] = text_ids[..., 2] + torch.arange(prompt_embeds.shape[1])[None, :]
+        text_ids = text_ids.to(device=device, dtype=dtype)
+        return prompt_embeds, text_ids
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        max_sequence_length=None,
+    ):
+        if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
+            logger.warning(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
+            )
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if max_sequence_length is not None and max_sequence_length > 256:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}")
+
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.reshape(
+            latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
+
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
+
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
+
+        return latents
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+
+        return latents, latent_image_ids
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = "",
+        guidance_scale: float = 5.0,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        sigmas: Optional[List[float]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 256,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                not greater than `1`).
+            guidance_scale (`float`, *optional*, defaults to 1.0):
+                True classifier-free guidance (guidance scale) is enabled when `guidance_scale` > 1 and
+                `negative_prompt` is provided.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ovis_image.OvisImagePipelineOutput`] or `tuple`:
+            [`~pipelines.ovis_image.OvisImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is a list with the generated images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            max_sequence_length=max_sequence_length,
+        )
+
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        do_classifier_free_guidance = guidance_scale > 1
+        (
+            prompt_embeds,
+            text_ids,
+        ) = self.encode_prompt(
+            prompt=prompt,
+            prompt_embeds=prompt_embeds,
+            device=device,
+            num_images_per_prompt=num_images_per_prompt,
+        )
+        if do_classifier_free_guidance:
+            (
+                negative_prompt_embeds,
+                negative_text_ids,
+            ) = self.encode_prompt(
+                prompt=negative_prompt,
+                prompt_embeds=negative_prompt_embeds,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+            )
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels // 4
+        latents, latent_image_ids = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 5. Prepare timesteps
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        if hasattr(self.scheduler.config, "use_flow_sigmas") and self.scheduler.config.use_flow_sigmas:
+            sigmas = None
+        image_seq_len = latents.shape[1]
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            mu=mu,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        if self.joint_attention_kwargs is None:
+            self._joint_attention_kwargs = {}
+
+        # 6. Denoising loop
+        # We set the index here to remove DtoH sync, helpful especially during compilation.
+        # Check out more details here: https://github.com/huggingface/diffusers/pull/11696
+        self.scheduler.set_begin_index(0)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                self._current_timestep = t
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0]).to(latents.dtype)
+
+                with self.transformer.cache_context("cond"):
+                    noise_pred = self.transformer(
+                        hidden_states=latents,
+                        timestep=timestep / 1000,
+                        encoder_hidden_states=prompt_embeds,
+                        txt_ids=text_ids,
+                        img_ids=latent_image_ids,
+                        return_dict=False,
+                    )[0]
+
+                if do_classifier_free_guidance:
+                    with self.transformer.cache_context("uncond"):
+                        neg_noise_pred = self.transformer(
+                            hidden_states=latents,
+                            timestep=timestep / 1000,
+                            encoder_hidden_states=negative_prompt_embeds,
+                            txt_ids=negative_text_ids,
+                            img_ids=latent_image_ids,
+                            return_dict=False,
+                        )[0]
+                    noise_pred = neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents_dtype = latents.dtype
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        self._current_timestep = None
+
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return OvisImagePipelineOutput(images=image)
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
index 24cbab43c966..381352ccc5d4 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
@@ -39,14 +39,11 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, MultiControlNetModel, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -936,21 +933,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
index a6df1b22c8b9..df5b3f5c10a5 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
@@ -39,14 +39,11 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, MultiControlNetModel, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -1049,21 +1046,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_kolors.py b/src/diffusers/pipelines/pag/pipeline_pag_kolors.py
index 1368358db6ba..1403be03a620 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_kolors.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_kolors.py
@@ -21,9 +21,8 @@
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import is_torch_xla_available, logging, replace_example_docstring
+from ...utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..kolors.pipeline_output import KolorsPipelineOutput
 from ..kolors.text_encoder import ChatGLMModel
@@ -598,22 +597,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
index bc281428e257..941b675099b9 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
@@ -326,7 +326,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py
index 22a8dac238f5..f40dd52fc244 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py
@@ -342,7 +342,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
index 6b62ddcc7ca5..a69f06536a55 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
@@ -32,15 +32,11 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_invisible_watermark_available,
     is_torch_xla_available,
     logging,
@@ -762,22 +758,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
index b6422b23648c..416d9e5677b4 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
@@ -34,14 +34,11 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_invisible_watermark_available,
     is_torch_xla_available,
     logging,
@@ -910,21 +907,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
index 2a8f7a448d5e..6be341e07b1a 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
@@ -34,14 +34,11 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     is_invisible_watermark_available,
     is_torch_xla_available,
     logging,
@@ -1001,21 +998,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/prx/pipeline_prx.py b/src/diffusers/pipelines/prx/pipeline_prx.py
index a3bd3e6b45e7..873f25316e6d 100644
--- a/src/diffusers/pipelines/prx/pipeline_prx.py
+++ b/src/diffusers/pipelines/prx/pipeline_prx.py
@@ -69,6 +69,39 @@
     "2.0": [704, 352],
 }
 
+ASPECT_RATIO_1024_BIN = {
+    "0.49": [704, 1440],
+    "0.52": [736, 1408],
+    "0.53": [736, 1376],
+    "0.57": [768, 1344],
+    "0.59": [768, 1312],
+    "0.62": [800, 1280],
+    "0.67": [832, 1248],
+    "0.68": [832, 1216],
+    "0.78": [896, 1152],
+    "0.83": [928, 1120],
+    "0.94": [992, 1056],
+    "1.0": [1024, 1024],
+    "1.06": [1056, 992],
+    "1.13": [1088, 960],
+    "1.21": [1120, 928],
+    "1.29": [1152, 896],
+    "1.37": [1184, 864],
+    "1.46": [1216, 832],
+    "1.5": [1248, 832],
+    "1.71": [1312, 768],
+    "1.75": [1344, 768],
+    "1.87": [1376, 736],
+    "1.91": [1408, 736],
+    "2.05": [1440, 704],
+}
+
+ASPECT_RATIO_BINS = {
+    256: ASPECT_RATIO_256_BIN,
+    512: ASPECT_RATIO_512_BIN,
+    1024: ASPECT_RATIO_1024_BIN,
+}
+
 logger = logging.get_logger(__name__)
 
 
@@ -536,11 +569,11 @@ def __call__(
                 in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                 passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 4.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -600,10 +633,12 @@ def __call__(
                     "Resolution binning requires a VAE with image_processor, but VAE is not available. "
                     "Set use_resolution_binning=False or provide a VAE."
                 )
-            if self.default_sample_size <= 256:
-                aspect_ratio_bin = ASPECT_RATIO_256_BIN
-            else:
-                aspect_ratio_bin = ASPECT_RATIO_512_BIN
+            if self.default_sample_size not in ASPECT_RATIO_BINS:
+                raise ValueError(
+                    f"Resolution binning is only supported for default_sample_size in {list(ASPECT_RATIO_BINS.keys())}, "
+                    f"but got {self.default_sample_size}. Set use_resolution_binning=False to disable aspect ratio binning."
+                )
+            aspect_ratio_bin = ASPECT_RATIO_BINS[self.default_sample_size]
 
             # Store original dimensions
             orig_height, orig_width = height, width
diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py b/src/diffusers/pipelines/sana/pipeline_sana.py
index ac979305ca6d..2beff802c6e0 100644
--- a/src/diffusers/pipelines/sana/pipeline_sana.py
+++ b/src/diffusers/pipelines/sana/pipeline_sana.py
@@ -1,4 +1,4 @@
-# Copyright 2025 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
+# Copyright 2025 SANA Authors and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/src/diffusers/pipelines/sana/pipeline_sana_sprint.py b/src/diffusers/pipelines/sana/pipeline_sana_sprint.py
index 62b978829271..04f45f817efb 100644
--- a/src/diffusers/pipelines/sana/pipeline_sana_sprint.py
+++ b/src/diffusers/pipelines/sana/pipeline_sana_sprint.py
@@ -1,4 +1,4 @@
-# Copyright 2025 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
+# Copyright 2025 SANA-Sprint Authors and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/src/diffusers/pipelines/sana_video/__init__.py b/src/diffusers/pipelines/sana_video/__init__.py
new file mode 100644
index 000000000000..73e224bf749d
--- /dev/null
+++ b/src/diffusers/pipelines/sana_video/__init__.py
@@ -0,0 +1,49 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_sana_video"] = ["SanaVideoPipeline"]
+    _import_structure["pipeline_sana_video_i2v"] = ["SanaImageToVideoPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_sana_video import SanaVideoPipeline
+        from .pipeline_sana_video_i2v import SanaImageToVideoPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/sana_video/pipeline_output.py b/src/diffusers/pipelines/sana_video/pipeline_output.py
new file mode 100644
index 000000000000..4d37923889eb
--- /dev/null
+++ b/src/diffusers/pipelines/sana_video/pipeline_output.py
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+
+import torch
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class SanaVideoPipelineOutput(BaseOutput):
+    r"""
+    Output class for Sana-Video pipelines.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+
+    frames: torch.Tensor
diff --git a/src/diffusers/pipelines/sana_video/pipeline_sana_video.py b/src/diffusers/pipelines/sana_video/pipeline_sana_video.py
new file mode 100644
index 000000000000..a786275e45a9
--- /dev/null
+++ b/src/diffusers/pipelines/sana_video/pipeline_sana_video.py
@@ -0,0 +1,1017 @@
+# Copyright 2025 SANA-Video Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import SanaLoraLoaderMixin
+from ...models import AutoencoderDC, AutoencoderKLWan, SanaVideoTransformer3DModel
+from ...schedulers import DPMSolverMultistepScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    USE_PEFT_BACKEND,
+    is_bs4_available,
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import get_device, is_torch_version, randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import SanaVideoPipelineOutput
+
+
+ASPECT_RATIO_480_BIN = {
+    "0.5": [448.0, 896.0],
+    "0.57": [480.0, 832.0],
+    "0.68": [528.0, 768.0],
+    "0.78": [560.0, 720.0],
+    "1.0": [624.0, 624.0],
+    "1.13": [672.0, 592.0],
+    "1.29": [720.0, 560.0],
+    "1.46": [768.0, 528.0],
+    "1.67": [816.0, 496.0],
+    "1.75": [832.0, 480.0],
+    "2.0": [896.0, 448.0],
+}
+
+
+ASPECT_RATIO_720_BIN = {
+    "0.5": [672.0, 1344.0],
+    "0.57": [704.0, 1280.0],
+    "0.68": [800.0, 1152.0],
+    "0.78": [832.0, 1088.0],
+    "1.0": [960.0, 960.0],
+    "1.13": [1024.0, 896.0],
+    "1.29": [1088.0, 832.0],
+    "1.46": [1152.0, 800.0],
+    "1.67": [1248.0, 736.0],
+    "1.75": [1280.0, 704.0],
+    "2.0": [1344.0, 672.0],
+}
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import SanaVideoPipeline
+        >>> from diffusers.utils import export_to_video
+
+        >>> pipe = SanaVideoPipeline.from_pretrained("Efficient-Large-Model/SANA-Video_2B_480p_diffusers")
+        >>> pipe.transformer.to(torch.bfloat16)
+        >>> pipe.text_encoder.to(torch.bfloat16)
+        >>> pipe.vae.to(torch.float32)
+        >>> pipe.to("cuda")
+        >>> motion_score = 30
+
+        >>> prompt = "Evening, backlight, side lighting, soft light, high contrast, mid-shot, centered composition, clean solo shot, warm color. A young Caucasian man stands in a forest, golden light glimmers on his hair as sunlight filters through the leaves. He wears a light shirt, wind gently blowing his hair and collar, light dances across his face with his movements. The background is blurred, with dappled light and soft tree shadows in the distance. The camera focuses on his lifted gaze, clear and emotional."
+        >>> negative_prompt = "A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."
+        >>> motion_prompt = f" motion score: {motion_score}."
+        >>> prompt = prompt + motion_prompt
+
+        >>> output = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     height=480,
+        ...     width=832,
+        ...     frames=81,
+        ...     guidance_scale=6,
+        ...     num_inference_steps=50,
+        ...     generator=torch.Generator(device="cuda").manual_seed(42),
+        ... ).frames[0]
+
+        >>> export_to_video(output, "sana-video-output.mp4", fps=16)
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class SanaVideoPipeline(DiffusionPipeline, SanaLoraLoaderMixin):
+    r"""
+    Pipeline for text-to-video generation using [Sana](https://huggingface.co/papers/2509.24695). This model inherits
+    from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all
+    pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        tokenizer ([`GemmaTokenizer`] or [`GemmaTokenizerFast`]):
+            The tokenizer used to tokenize the prompt.
+        text_encoder ([`Gemma2PreTrainedModel`]):
+            Text encoder model to encode the input prompts.
+        vae ([`AutoencoderKLWan` or `AutoencoderDCAEV`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        transformer ([`SanaVideoTransformer3DModel`]):
+            Conditional Transformer to denoise the input latents.
+        scheduler ([`DPMSolverMultistepScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    # fmt: off
+    bad_punct_regex = re.compile(r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + "\\" + r"\/" + r"\*" + r"]{1,}")
+    # fmt: on
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
+        text_encoder: Gemma2PreTrainedModel,
+        vae: Union[AutoencoderDC, AutoencoderKLWan],
+        transformer: SanaVideoTransformer3DModel,
+        scheduler: DPMSolverMultistepScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+
+        self.vae_scale_factor = self.vae_scale_factor_spatial
+
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.sana.pipeline_sana.SanaPipeline._get_gemma_prompt_embeds
+    def _get_gemma_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        dtype: torch.dtype,
+        clean_caption: bool = False,
+        max_sequence_length: int = 300,
+        complex_human_instruction: Optional[List[str]] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+            complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`):
+                If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
+                the prompt.
+        """
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if getattr(self, "tokenizer", None) is not None:
+            self.tokenizer.padding_side = "right"
+
+        prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+
+        # prepare complex human instruction
+        if not complex_human_instruction:
+            max_length_all = max_sequence_length
+        else:
+            chi_prompt = "\n".join(complex_human_instruction)
+            prompt = [chi_prompt + p for p in prompt]
+            num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
+            max_length_all = num_chi_prompt_tokens + max_sequence_length - 2
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_length_all,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        prompt_attention_mask = text_inputs.attention_mask
+        prompt_attention_mask = prompt_attention_mask.to(device)
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+        prompt_embeds = prompt_embeds[0].to(dtype=dtype, device=device)
+
+        return prompt_embeds, prompt_attention_mask
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_videos_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        max_sequence_length: int = 300,
+        complex_human_instruction: Optional[List[str]] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the video generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                number of videos that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For Sana, it's should be the embeddings of the "" string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+            complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`):
+                If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
+                the prompt.
+        """
+
+        if device is None:
+            device = self._execution_device
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        else:
+            dtype = None
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SanaLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if getattr(self, "tokenizer", None) is not None:
+            self.tokenizer.padding_side = "right"
+
+        # See Section 3.1. of the paper.
+        max_length = max_sequence_length
+        select_index = [0] + list(range(-max_length + 1, 0))
+
+        if prompt_embeds is None:
+            prompt_embeds, prompt_attention_mask = self._get_gemma_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                dtype=dtype,
+                clean_caption=clean_caption,
+                max_sequence_length=max_sequence_length,
+                complex_human_instruction=complex_human_instruction,
+            )
+
+            prompt_embeds = prompt_embeds[:, select_index]
+            prompt_attention_mask = prompt_attention_mask[:, select_index]
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_videos_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_embeds, negative_prompt_attention_mask = self._get_gemma_prompt_embeds(
+                prompt=negative_prompt,
+                device=device,
+                dtype=dtype,
+                clean_caption=clean_caption,
+                max_sequence_length=max_sequence_length,
+                complex_human_instruction=False,
+            )
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_videos_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        if self.text_encoder is not None:
+            if isinstance(self, SanaLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_on_step_end_tensor_inputs=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 32 != 0 or width % 32 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip addresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_latent_frames,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1.0
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 6.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        height: int = 480,
+        width: int = 832,
+        frames: int = 81,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        clean_caption: bool = False,
+        use_resolution_binning: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 300,
+        complex_human_instruction: List[str] = [
+            "Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for video generation. Evaluate the level of detail in the user prompt:",
+            "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, motion, and temporal relationships to create vivid and dynamic scenes.",
+            "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.",
+            "Here are examples of how to transform or refine prompts:",
+            "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat slowly settling into a curled position, peacefully falling asleep on a warm sunny windowsill, with gentle sunlight filtering through surrounding pots of blooming red flowers.",
+            "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps gradually lighting up, a diverse crowd of people in colorful clothing walking past, and a double-decker bus smoothly passing by towering glass skyscrapers.",
+            "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:",
+            "User Prompt: ",
+        ],
+    ) -> Union[SanaVideoPipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate videos that are closely linked to
+                the text `prompt`, usually at the expense of lower video quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            height (`int`, *optional*, defaults to 480):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to 832):
+                The width in pixels of the generated video.
+            frames (`int`, *optional*, defaults to 81):
+                The number of frames in the generated video.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between mp4 or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`SanaVideoPipelineOutput`] instead of a plain tuple.
+            attention_kwargs:
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_480_BIN` or `ASPECT_RATIO_720_BIN`. After the produced latents are decoded into videos,
+                they are resized back to the requested resolution. Useful for generating non-square videos.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to `300`):
+                Maximum sequence length to use with the `prompt`.
+            complex_human_instruction (`List[str]`, *optional*):
+                Instructions for complex human attention:
+                https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated videos
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        if use_resolution_binning:
+            if self.transformer.config.sample_size == 30:
+                aspect_ratio_bin = ASPECT_RATIO_480_BIN
+            elif self.transformer.config.sample_size == 22:
+                aspect_ratio_bin = ASPECT_RATIO_720_BIN
+            else:
+                raise ValueError("Invalid sample size")
+            orig_height, orig_width = height, width
+            height, width = self.video_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_on_step_end_tensor_inputs,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_videos_per_prompt=num_videos_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+            complex_human_instruction=complex_human_instruction,
+            lora_scale=lora_scale,
+        )
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            latent_channels,
+            height,
+            width,
+            frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        transformer_dtype = self.transformer.dtype
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latent_model_input.shape[0])
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    latent_model_input.to(dtype=transformer_dtype),
+                    encoder_hidden_states=prompt_embeds.to(dtype=transformer_dtype),
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=timestep,
+                    return_dict=False,
+                    attention_kwargs=self.attention_kwargs,
+                )[0]
+                noise_pred = noise_pred.float()
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+                # compute previous image: x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            video = latents
+        else:
+            latents = latents.to(self.vae.dtype)
+            torch_accelerator_module = getattr(torch, get_device(), torch.cuda)
+            oom_error = (
+                torch.OutOfMemoryError
+                if is_torch_version(">=", "2.5.0")
+                else torch_accelerator_module.OutOfMemoryError
+            )
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            try:
+                video = self.vae.decode(latents, return_dict=False)[0]
+            except oom_error as e:
+                warnings.warn(
+                    f"{e}. \n"
+                    f"Try to use VAE tiling for large images. For example: \n"
+                    f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
+                )
+
+            if use_resolution_binning:
+                video = self.video_processor.resize_and_crop_tensor(video, orig_width, orig_height)
+
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return SanaVideoPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/sana_video/pipeline_sana_video_i2v.py b/src/diffusers/pipelines/sana_video/pipeline_sana_video_i2v.py
new file mode 100644
index 000000000000..e87880b64cee
--- /dev/null
+++ b/src/diffusers/pipelines/sana_video/pipeline_sana_video_i2v.py
@@ -0,0 +1,1066 @@
+# Copyright 2025 SANA-Video Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import inspect
+import re
+import urllib.parse as ul
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL
+import torch
+from transformers import Gemma2PreTrainedModel, GemmaTokenizer, GemmaTokenizerFast
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput
+from ...loaders import SanaLoraLoaderMixin
+from ...models import AutoencoderDC, AutoencoderKLWan, SanaVideoTransformer3DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import (
+    BACKENDS_MAPPING,
+    USE_PEFT_BACKEND,
+    is_bs4_available,
+    is_ftfy_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import get_device, is_torch_version, randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import SanaVideoPipelineOutput
+from .pipeline_sana_video import ASPECT_RATIO_480_BIN, ASPECT_RATIO_720_BIN
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_bs4_available():
+    from bs4 import BeautifulSoup
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import SanaImageToVideoPipeline
+        >>> from diffusers.utils import export_to_video, load_image
+
+        >>> pipe = SanaImageToVideoPipeline.from_pretrained("Efficient-Large-Model/SANA-Video_2B_480p_diffusers")
+        >>> pipe.transformer.to(torch.bfloat16)
+        >>> pipe.text_encoder.to(torch.bfloat16)
+        >>> pipe.vae.to(torch.float32)
+        >>> pipe.to("cuda")
+        >>> motion_score = 30
+
+        >>> prompt = "A woman stands against a stunning sunset backdrop, her long, wavy brown hair gently blowing in the breeze. She wears a sleeveless, light-colored blouse with a deep V-neckline, which accentuates her graceful posture. The warm hues of the setting sun cast a golden glow across her face and hair, creating a serene and ethereal atmosphere. The background features a blurred landscape with soft, rolling hills and scattered clouds, adding depth to the scene. The camera remains steady, capturing the tranquil moment from a medium close-up angle."
+        >>> negative_prompt = "A chaotic sequence with misshapen, deformed limbs in heavy motion blur, sudden disappearance, jump cuts, jerky movements, rapid shot changes, frames out of sync, inconsistent character shapes, temporal artifacts, jitter, and ghosting effects, creating a disorienting visual experience."
+        >>> motion_prompt = f" motion score: {motion_score}."
+        >>> prompt = prompt + motion_prompt
+        >>> image = load_image("https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/main/asset/samples/i2v-1.png")
+
+        >>> output = pipe(
+        ...     image=image,
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     height=480,
+        ...     width=832,
+        ...     frames=81,
+        ...     guidance_scale=6,
+        ...     num_inference_steps=50,
+        ...     generator=torch.Generator(device="cuda").manual_seed(42),
+        ... ).frames[0]
+
+        >>> export_to_video(output, "sana-ti2v-output.mp4", fps=16)
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class SanaImageToVideoPipeline(DiffusionPipeline, SanaLoraLoaderMixin):
+    r"""
+    Pipeline for image/text-to-video generation using [Sana](https://huggingface.co/papers/2509.24695). This model
+    inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all
+    pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        tokenizer ([`GemmaTokenizer`] or [`GemmaTokenizerFast`]):
+            The tokenizer used to tokenize the prompt.
+        text_encoder ([`Gemma2PreTrainedModel`]):
+            Text encoder model to encode the input prompts.
+        vae ([`AutoencoderKLWan` or `AutoencoderDCAEV`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        transformer ([`SanaVideoTransformer3DModel`]):
+            Conditional Transformer to denoise the input latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+    """
+
+    # fmt: off
+    bad_punct_regex = re.compile(r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)" + r"\(" + r"\]" + r"\[" + r"\}" + r"\{" + r"\|" + "\\" + r"\/" + r"\*" + r"]{1,}")
+    # fmt: on
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: Union[GemmaTokenizer, GemmaTokenizerFast],
+        text_encoder: Gemma2PreTrainedModel,
+        vae: Union[AutoencoderDC, AutoencoderKLWan],
+        transformer: SanaVideoTransformer3DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
+        )
+
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+
+        self.vae_scale_factor = self.vae_scale_factor_spatial
+
+        self.transformer_spatial_patch_size = (
+            self.transformer.config.patch_size[1] if getattr(self, "transformer", None) is not None else 1
+        )
+        self.transformer_temporal_patch_size = (
+            self.transformer.config.patch_size[0] if getattr(self, "transformer") is not None else 1
+        )
+
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.sana.pipeline_sana.SanaPipeline._get_gemma_prompt_embeds
+    def _get_gemma_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]],
+        device: torch.device,
+        dtype: torch.dtype,
+        clean_caption: bool = False,
+        max_sequence_length: int = 300,
+        complex_human_instruction: Optional[List[str]] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+            complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`):
+                If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
+                the prompt.
+        """
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        if getattr(self, "tokenizer", None) is not None:
+            self.tokenizer.padding_side = "right"
+
+        prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+
+        # prepare complex human instruction
+        if not complex_human_instruction:
+            max_length_all = max_sequence_length
+        else:
+            chi_prompt = "\n".join(complex_human_instruction)
+            prompt = [chi_prompt + p for p in prompt]
+            num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
+            max_length_all = num_chi_prompt_tokens + max_sequence_length - 2
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_length_all,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        prompt_attention_mask = text_inputs.attention_mask
+        prompt_attention_mask = prompt_attention_mask.to(device)
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+        prompt_embeds = prompt_embeds[0].to(dtype=dtype, device=device)
+
+        return prompt_embeds, prompt_attention_mask
+
+    # Copied from diffusers.pipelines.sana_video.pipeline_sana_video.SanaVideoPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_videos_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        clean_caption: bool = False,
+        max_sequence_length: int = 300,
+        complex_human_instruction: Optional[List[str]] = None,
+        lora_scale: Optional[float] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the video generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                number of videos that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For Sana, it's should be the embeddings of the "" string.
+            clean_caption (`bool`, defaults to `False`):
+                If `True`, the function will preprocess and clean the provided caption before encoding.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+            complex_human_instruction (`list[str]`, defaults to `complex_human_instruction`):
+                If `complex_human_instruction` is not empty, the function will use the complex Human instruction for
+                the prompt.
+        """
+
+        if device is None:
+            device = self._execution_device
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        else:
+            dtype = None
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, SanaLoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if self.text_encoder is not None and USE_PEFT_BACKEND:
+                scale_lora_layers(self.text_encoder, lora_scale)
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if getattr(self, "tokenizer", None) is not None:
+            self.tokenizer.padding_side = "right"
+
+        # See Section 3.1. of the paper.
+        max_length = max_sequence_length
+        select_index = [0] + list(range(-max_length + 1, 0))
+
+        if prompt_embeds is None:
+            prompt_embeds, prompt_attention_mask = self._get_gemma_prompt_embeds(
+                prompt=prompt,
+                device=device,
+                dtype=dtype,
+                clean_caption=clean_caption,
+                max_sequence_length=max_sequence_length,
+                complex_human_instruction=complex_human_instruction,
+            )
+
+            prompt_embeds = prompt_embeds[:, select_index]
+            prompt_attention_mask = prompt_attention_mask[:, select_index]
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_videos_per_prompt, 1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            negative_prompt_embeds, negative_prompt_attention_mask = self._get_gemma_prompt_embeds(
+                prompt=negative_prompt,
+                device=device,
+                dtype=dtype,
+                clean_caption=clean_caption,
+                max_sequence_length=max_sequence_length,
+                complex_human_instruction=False,
+            )
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_videos_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        if self.text_encoder is not None:
+            if isinstance(self, SanaLoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(self.text_encoder, lora_scale)
+
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        height,
+        width,
+        callback_on_step_end_tensor_inputs=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        if height % 32 != 0 or width % 32 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
+
+        if image is not None and not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image):
+            raise ValueError(f"`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is {type(image)}")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip addresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    def prepare_latents(
+        self,
+        image: PipelineImageInput,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_latent_frames,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        image = image.unsqueeze(2)  # [B, C, 1, H, W]
+        image = image.to(device=device, dtype=self.vae.dtype)
+
+        if isinstance(generator, list):
+            image_latents = [retrieve_latents(self.vae.encode(image), sample_mode="argmax") for _ in generator]
+            image_latents = torch.cat(image_latents)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
+            image_latents = image_latents.repeat(batch_size, 1, 1, 1, 1)
+
+        latents_mean = (
+            torch.tensor(self.vae.config.latents_mean)
+            .view(1, -1, 1, 1, 1)
+            .to(image_latents.device, image_latents.dtype)
+        )
+        latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, -1, 1, 1, 1).to(
+            image_latents.device, image_latents.dtype
+        )
+        image_latents = (image_latents - latents_mean) * latents_std
+
+        latents[:, :, 0:1] = image_latents.to(dtype)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1.0
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 50,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 6.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        height: int = 480,
+        width: int = 832,
+        frames: int = 81,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        clean_caption: bool = False,
+        use_resolution_binning: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 300,
+        complex_human_instruction: List[str] = [
+            "Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for video generation. Evaluate the level of detail in the user prompt:",
+            "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, motion, and temporal relationships to create vivid and dynamic scenes.",
+            "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.",
+            "Here are examples of how to transform or refine prompts:",
+            "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat slowly settling into a curled position, peacefully falling asleep on a warm sunny windowsill, with gentle sunlight filtering through surrounding pots of blooming red flowers.",
+            "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps gradually lighting up, a diverse crowd of people in colorful clothing walking past, and a double-decker bus smoothly passing by towering glass skyscrapers.",
+            "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:",
+            "User Prompt: ",
+        ],
+    ) -> Union[SanaVideoPipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input image to condition the video generation on. The first frame of the generated video will be
+                conditioned on this image.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate videos that are closely linked to
+                the text `prompt`, usually at the expense of lower video quality.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            height (`int`, *optional*, defaults to 480):
+                The height in pixels of the generated video.
+            width (`int`, *optional*, defaults to 832):
+                The width in pixels of the generated video.
+            frames (`int`, *optional*, defaults to 81):
+                The number of frames in the generated video.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
+                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated video. Choose between mp4 or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`SanaVideoPipelineOutput`] instead of a plain tuple.
+            attention_kwargs:
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clean_caption (`bool`, *optional*, defaults to `True`):
+                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
+                be installed. If the dependencies are not installed, the embeddings will be created from the raw
+                prompt.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_480_BIN` or `ASPECT_RATIO_720_BIN`. After the produced latents are decoded into videos,
+                they are resized back to the requested resolution. Useful for generating non-square videos.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int` defaults to `300`):
+                Maximum sequence length to use with the `prompt`.
+            complex_human_instruction (`List[str]`, *optional*):
+                Instructions for complex human attention:
+                https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.sana_video.pipeline_output.SanaVideoPipelineOutput`] is
+                returned, otherwise a `tuple` is returned where the first element is a list with the generated videos
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        if use_resolution_binning:
+            if self.transformer.config.sample_size == 30:
+                aspect_ratio_bin = ASPECT_RATIO_480_BIN
+            elif self.transformer.config.sample_size == 22:
+                aspect_ratio_bin = ASPECT_RATIO_720_BIN
+            else:
+                raise ValueError("Invalid sample size")
+            orig_height, orig_width = height, width
+            height, width = self.video_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+
+        self.check_inputs(
+            prompt,
+            image,
+            height,
+            width,
+            callback_on_step_end_tensor_inputs,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+        lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None
+
+        # 3. Encode input prompt
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            num_videos_per_prompt=num_videos_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            clean_caption=clean_caption,
+            max_sequence_length=max_sequence_length,
+            complex_human_instruction=complex_human_instruction,
+            lora_scale=lora_scale,
+        )
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latents.
+        latent_channels = self.transformer.config.in_channels
+        image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+
+        latents = self.prepare_latents(
+            image,
+            batch_size * num_videos_per_prompt,
+            latent_channels,
+            height,
+            width,
+            frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+
+        conditioning_mask = latents.new_zeros(
+            batch_size,
+            1,
+            latents.shape[2] // self.transformer_temporal_patch_size,
+            latents.shape[3] // self.transformer_spatial_patch_size,
+            latents.shape[4] // self.transformer_spatial_patch_size,
+        )
+        conditioning_mask[:, :, 0] = 1.0
+        if self.do_classifier_free_guidance:
+            conditioning_mask = torch.cat([conditioning_mask, conditioning_mask])
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        transformer_dtype = self.transformer.dtype
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(conditioning_mask.shape)
+                timestep = timestep * (1 - conditioning_mask)
+
+                # predict noise model_output
+                noise_pred = self.transformer(
+                    latent_model_input.to(dtype=transformer_dtype),
+                    encoder_hidden_states=prompt_embeds.to(dtype=transformer_dtype),
+                    encoder_attention_mask=prompt_attention_mask,
+                    timestep=timestep,
+                    return_dict=False,
+                    attention_kwargs=self.attention_kwargs,
+                )[0]
+                noise_pred = noise_pred.float()
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    timestep, _ = timestep.chunk(2)
+
+                # learned sigma
+                if self.transformer.config.out_channels // 2 == latent_channels:
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+                noise_pred = noise_pred[:, :, 1:]
+                noise_latents = latents[:, :, 1:]
+                pred_latents = self.scheduler.step(
+                    noise_pred, t, noise_latents, **extra_step_kwargs, return_dict=False
+                )[0]
+
+                latents = torch.cat([latents[:, :, :1], pred_latents], dim=2)
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
+        if output_type == "latent":
+            video = latents
+        else:
+            latents = latents.to(self.vae.dtype)
+            torch_accelerator_module = getattr(torch, get_device(), torch.cuda)
+            oom_error = (
+                torch.OutOfMemoryError
+                if is_torch_version(">=", "2.5.0")
+                else torch_accelerator_module.OutOfMemoryError
+            )
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            try:
+                video = self.vae.decode(latents, return_dict=False)[0]
+            except oom_error as e:
+                warnings.warn(
+                    f"{e}. \n"
+                    f"Try to use VAE tiling for large images. For example: \n"
+                    f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
+                )
+
+            if use_resolution_binning:
+                video = self.video_processor.resize_and_crop_tensor(video, orig_width, orig_height)
+
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return SanaVideoPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py
index b268eae806a6..d1d05c894595 100644
--- a/src/diffusers/pipelines/shap_e/renderer.py
+++ b/src/diffusers/pipelines/shap_e/renderer.py
@@ -742,7 +742,7 @@ class ShapEParamsProjModel(ModelMixin, ConfigMixin):
     def __init__(
         self,
         *,
-        param_names: Tuple[str] = (
+        param_names: Tuple[str, ...] = (
             "nerstf.mlp.0.weight",
             "nerstf.mlp.1.weight",
             "nerstf.mlp.2.weight",
@@ -786,13 +786,13 @@ class ShapERenderer(ModelMixin, ConfigMixin):
     def __init__(
         self,
         *,
-        param_names: Tuple[str] = (
+        param_names: Tuple[str, ...] = (
             "nerstf.mlp.0.weight",
             "nerstf.mlp.1.weight",
             "nerstf.mlp.2.weight",
             "nerstf.mlp.3.weight",
         ),
-        param_shapes: Tuple[Tuple[int]] = (
+        param_shapes: Tuple[Tuple[int, int], ...] = (
             (256, 93),
             (256, 256),
             (256, 256),
@@ -804,7 +804,7 @@ def __init__(
         n_hidden_layers: int = 6,
         act_fn: str = "swish",
         insert_direction_at: int = 4,
-        background: Tuple[float] = (
+        background: Tuple[float, ...] = (
             255.0,
             255.0,
             255.0,
diff --git a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py
index 8562a5eaf0e6..d6cd7d7feceb 100644
--- a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py
+++ b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.py
@@ -415,11 +415,11 @@ def __call__(
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `6.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
diff --git a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py
index d0a4e118ce43..089f92632d38 100644
--- a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py
+++ b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing.py
@@ -647,11 +647,11 @@ def __call__(
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `6.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
diff --git a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py
index 959cbb32f23a..2951a9447386 100644
--- a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py
+++ b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_diffusion_forcing_i2v.py
@@ -698,11 +698,11 @@ def __call__(
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
diff --git a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py
index d59b4ce3cb17..d61b687eadc3 100644
--- a/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py
+++ b/src/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2_i2v.py
@@ -524,11 +524,11 @@ def __call__(
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 02f7526b7102..f13cdc67073f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -24,10 +24,6 @@
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
 from ...utils import (
@@ -529,21 +525,12 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         return latents
 
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @torch.no_grad()
     def __call__(
diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
index 16aff102599c..65daafe01237 100644
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py
+++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
@@ -17,7 +17,7 @@
 import torch.nn as nn
 from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
 
-from ...utils import logging
+from ...utils import is_transformers_version, logging
 
 
 logger = logging.get_logger(__name__)
@@ -46,6 +46,9 @@ def __init__(self, config: CLIPConfig):
 
         self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
         self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False)
+        # Model requires post_init after transformers v4.57.3
+        if is_transformers_version(">", "4.57.3"):
+            self.post_init()
 
     @torch.no_grad()
     def forward(self, clip_input, images):
diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
index 3b7b26dc636c..660d9801df56 100644
--- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
+++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
@@ -336,7 +336,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
index db047f19924d..9b11bc8781e7 100644
--- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
@@ -361,7 +361,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py
index c95fa530c8d7..b947cbff0914 100644
--- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py
@@ -367,7 +367,7 @@ def _get_clip_prompt_embeds(
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
 
-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
         pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
 
         return prompt_embeds, pooled_prompt_embeds
diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
index 766ca37d8142..f9a8abfcc568 100644
--- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
@@ -33,15 +33,11 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
 from ...utils import (
     USE_PEFT_BACKEND,
+    deprecate,
     logging,
     replace_example_docstring,
     scale_lora_layers,
@@ -545,22 +541,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @property
     def guidance_scale(self):
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index b97cf6f1f6f8..e969d2a21a99 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -33,11 +33,6 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -750,22 +745,12 @@ def _get_add_time_ids(
         return add_time_ids
 
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 44e8f4fe4b54..8d1da8dc102c 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -34,10 +34,6 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -897,21 +893,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
index 88cc7515b051..54a1e311804c 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
@@ -35,10 +35,6 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -1002,21 +998,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
index 58b008361782..5e13362eb3d1 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -22,11 +22,6 @@
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -590,22 +585,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
index 2802d690f3cc..0ea3ba5046cf 100644
--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
@@ -34,15 +34,12 @@
     TextualInversionLoaderMixin,
 )
 from ...models import AutoencoderKL, ImageProjection, MultiAdapter, T2IAdapter, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     PIL_INTERPOLATION,
     USE_PEFT_BACKEND,
+    deprecate,
     is_torch_xla_available,
     logging,
     replace_example_docstring,
@@ -779,21 +776,12 @@ def _get_add_time_ids(
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
     def _default_height_width(self, height, width, image):
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
index 288aae6c0d44..c8dce75e2671 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
@@ -19,16 +19,12 @@
 from ...image_processor import VaeImageProcessor
 from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.attention_processor import (
-    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
-    XFormersAttnProcessor,
-)
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     USE_PEFT_BACKEND,
     BaseOutput,
+    deprecate,
     is_invisible_watermark_available,
     logging,
     scale_lora_layers,
@@ -457,22 +453,12 @@ def prepare_extra_step_kwargs(self, generator, eta):
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
     def upcast_vae(self):
-        dtype = self.vae.dtype
-        self.vae.to(dtype=torch.float32)
-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                FusedAttnProcessor2_0,
-            ),
+        deprecate(
+            "upcast_vae",
+            "1.0.0",
+            "`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.",
         )
-        # if xformers or torch_2_0 is used attention block does not need
-        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
-            self.vae.post_quant_conv.to(dtype)
-            self.vae.decoder.conv_in.to(dtype)
-            self.vae.decoder.mid_block.to(dtype)
+        self.vae.to(dtype=torch.float32)
 
     # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
     def _get_add_time_ids(
diff --git a/src/diffusers/pipelines/wan/__init__.py b/src/diffusers/pipelines/wan/__init__.py
index bb96372b1db2..ad51a52f9242 100644
--- a/src/diffusers/pipelines/wan/__init__.py
+++ b/src/diffusers/pipelines/wan/__init__.py
@@ -23,6 +23,7 @@
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
     _import_structure["pipeline_wan"] = ["WanPipeline"]
+    _import_structure["pipeline_wan_animate"] = ["WanAnimatePipeline"]
     _import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
     _import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
     _import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
@@ -35,10 +36,10 @@
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
         from .pipeline_wan import WanPipeline
+        from .pipeline_wan_animate import WanAnimatePipeline
         from .pipeline_wan_i2v import WanImageToVideoPipeline
         from .pipeline_wan_vace import WanVACEPipeline
         from .pipeline_wan_video2video import WanVideoToVideoPipeline
-
 else:
     import sys
 
diff --git a/src/diffusers/pipelines/wan/image_processor.py b/src/diffusers/pipelines/wan/image_processor.py
new file mode 100644
index 000000000000..b1594d08630f
--- /dev/null
+++ b/src/diffusers/pipelines/wan/image_processor.py
@@ -0,0 +1,185 @@
+# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+
+from ...configuration_utils import register_to_config
+from ...image_processor import VaeImageProcessor
+from ...utils import PIL_INTERPOLATION
+
+
+class WanAnimateImageProcessor(VaeImageProcessor):
+    r"""
+    Image processor to preprocess the reference (character) image for the Wan Animate model.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
+            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
+        vae_scale_factor (`int`, *optional*, defaults to `8`):
+            VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
+            this factor.
+        vae_latent_channels (`int`, *optional*, defaults to `16`):
+            VAE latent channels.
+        spatial_patch_size (`Tuple[int, int]`, *optional*, defaults to `(2, 2)`):
+            The spatial patch size used by the diffusion transformer. For Wan models, this is typically (2, 2).
+        resample (`str`, *optional*, defaults to `lanczos`):
+            Resampling filter to use when resizing the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image to [-1,1].
+        do_binarize (`bool`, *optional*, defaults to `False`):
+            Whether to binarize the image to 0/1.
+        do_convert_rgb (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to RGB format.
+        do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to grayscale format.
+        fill_color (`str` or `float` or `Tuple[float, ...]`, *optional*, defaults to `None`):
+            An optional fill color when `resize_mode` is set to `"fill"`. This will fill the empty space with that
+            color instead of filling with data from the image. Any valid `color` argument to `PIL.Image.new` is valid;
+            if `None`, will default to filling with data from `image`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 8,
+        vae_latent_channels: int = 16,
+        spatial_patch_size: Tuple[int, int] = (2, 2),
+        resample: str = "lanczos",
+        reducing_gap: int = None,
+        do_normalize: bool = True,
+        do_binarize: bool = False,
+        do_convert_rgb: bool = False,
+        do_convert_grayscale: bool = False,
+        fill_color: Optional[Union[str, float, Tuple[float, ...]]] = 0,
+    ):
+        super().__init__()
+        if do_convert_rgb and do_convert_grayscale:
+            raise ValueError(
+                "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`,"
+                " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
+                " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
+            )
+
+    def _resize_and_fill(
+        self,
+        image: PIL.Image.Image,
+        width: int,
+        height: int,
+    ) -> PIL.Image.Image:
+        r"""
+        Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
+        the image within the dimensions, filling empty with data from image.
+
+        Args:
+            image (`PIL.Image.Image`):
+                The image to resize and fill.
+            width (`int`):
+                The width to resize the image to.
+            height (`int`):
+                The height to resize the image to.
+
+        Returns:
+            `PIL.Image.Image`:
+                The resized and filled image.
+        """
+
+        ratio = width / height
+        src_ratio = image.width / image.height
+        fill_with_image_data = self.config.fill_color is None
+        fill_color = self.config.fill_color or 0
+
+        src_w = width if ratio < src_ratio else image.width * height // image.height
+        src_h = height if ratio >= src_ratio else image.height * width // image.width
+
+        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
+        res = PIL.Image.new("RGB", (width, height), color=fill_color)
+        res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
+
+        if fill_with_image_data:
+            if ratio < src_ratio:
+                fill_height = height // 2 - src_h // 2
+                if fill_height > 0:
+                    res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
+                    res.paste(
+                        resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
+                        box=(0, fill_height + src_h),
+                    )
+            elif ratio > src_ratio:
+                fill_width = width // 2 - src_w // 2
+                if fill_width > 0:
+                    res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
+                    res.paste(
+                        resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
+                        box=(fill_width + src_w, 0),
+                    )
+
+        return res
+
+    def get_default_height_width(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+    ) -> Tuple[int, int]:
+        r"""
+        Returns the height and width of the image, downscaled to the next integer multiple of `vae_scale_factor`.
+
+        Args:
+            image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+                The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
+                should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
+                tensor, it should have shape `[batch, channels, height, width]`.
+            height (`Optional[int]`, *optional*, defaults to `None`):
+                The height of the preprocessed image. If `None`, the height of the `image` input will be used.
+            width (`Optional[int]`, *optional*, defaults to `None`):
+                The width of the preprocessed image. If `None`, the width of the `image` input will be used.
+
+        Returns:
+            `Tuple[int, int]`:
+                A tuple containing the height and width, both resized to the nearest integer multiple of
+                `vae_scale_factor * spatial_patch_size`.
+        """
+
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[2]
+            else:
+                height = image.shape[1]
+
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[3]
+            else:
+                width = image.shape[2]
+
+        max_area = width * height
+        aspect_ratio = height / width
+        mod_value_h = self.config.vae_scale_factor * self.config.spatial_patch_size[0]
+        mod_value_w = self.config.vae_scale_factor * self.config.spatial_patch_size[1]
+
+        # Try to preserve the aspect ratio
+        height = round(np.sqrt(max_area * aspect_ratio)) // mod_value_h * mod_value_h
+        width = round(np.sqrt(max_area / aspect_ratio)) // mod_value_w * mod_value_w
+
+        return height, width
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_animate.py b/src/diffusers/pipelines/wan/pipeline_wan_animate.py
new file mode 100644
index 000000000000..c7c983b2f7d4
--- /dev/null
+++ b/src/diffusers/pipelines/wan/pipeline_wan_animate.py
@@ -0,0 +1,1204 @@
+# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from copy import deepcopy
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import PIL
+import regex as re
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...image_processor import PipelineImageInput
+from ...loaders import WanLoraLoaderMixin
+from ...models import AutoencoderKLWan, WanAnimateTransformer3DModel
+from ...schedulers import UniPCMultistepScheduler
+from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .image_processor import WanAnimateImageProcessor
+from .pipeline_output import WanPipelineOutput
+
+
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> import numpy as np
+        >>> from diffusers import WanAnimatePipeline
+        >>> from diffusers.utils import export_to_video, load_image, load_video
+
+        >>> model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
+        >>> pipe = WanAnimatePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> # Optionally upcast the Wan VAE to FP32
+        >>> pipe.vae.to(torch.float32)
+        >>> pipe.to("cuda")
+
+        >>> # Load the reference character image
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
+        ... )
+
+        >>> # Load pose and face videos (preprocessed from reference video)
+        >>> # Note: Videos should be preprocessed to extract pose keypoints and face features
+        >>> # Refer to the Wan-Animate preprocessing documentation for details
+        >>> pose_video = load_video("path/to/pose_video.mp4")
+        >>> face_video = load_video("path/to/face_video.mp4")
+
+        >>> # CFG is generally not used for Wan Animate
+        >>> prompt = (
+        ...     "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
+        ...     "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
+        ... )
+
+        >>> # Animation mode: Animate the character with the motion from pose/face videos
+        >>> output = pipe(
+        ...     image=image,
+        ...     pose_video=pose_video,
+        ...     face_video=face_video,
+        ...     prompt=prompt,
+        ...     height=height,
+        ...     width=width,
+        ...     segment_frame_length=77,  # Frame length of each inference segment
+        ...     guidance_scale=1.0,
+        ...     num_inference_steps=20,
+        ...     mode="animate",
+        ... ).frames[0]
+        >>> export_to_video(output, "output_animation.mp4", fps=30)
+
+        >>> # Replacement mode: Replace a character in the background video
+        >>> # Requires additional background_video and mask_video inputs
+        >>> background_video = load_video("path/to/background_video.mp4")
+        >>> mask_video = load_video("path/to/mask_video.mp4")  # Black areas preserved, white areas generated
+        >>> output = pipe(
+        ...     image=image,
+        ...     pose_video=pose_video,
+        ...     face_video=face_video,
+        ...     background_video=background_video,
+        ...     mask_video=mask_video,
+        ...     prompt=prompt,
+        ...     height=height,
+        ...     width=width,
+        ...     segment_frame_length=77,  # Frame length of each inference segment
+        ...     guidance_scale=1.0,
+        ...     num_inference_steps=20,
+        ...     mode="replace",
+        ... ).frames[0]
+        >>> export_to_video(output, "output_replacement.mp4", fps=30)
+        ```
+"""
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+def prompt_clean(text):
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+class WanAnimatePipeline(DiffusionPipeline, WanLoraLoaderMixin):
+    r"""
+    Pipeline for unified character animation and replacement using Wan-Animate.
+
+    WanAnimatePipeline takes a character image, pose video, and face video as input, and generates a video in two
+    modes:
+
+    1. **Animation mode**: The model generates a video of the character image that mimics the human motion in the input
+       pose and face videos. The character is animated based on the provided motion controls, creating a new animated
+       video of the character.
+
+    2. **Replacement mode**: The model replaces a character in a background video with the provided character image,
+       using the pose and face videos for motion control. This mode requires additional `background_video` and
+       `mask_video` inputs. The mask video should have black regions where the original content should be preserved and
+       white regions where the new character should be generated.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.WanLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+
+    Args:
+        tokenizer ([`T5Tokenizer`]):
+            Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
+            specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
+        text_encoder ([`T5EncoderModel`]):
+            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
+            the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
+        image_encoder ([`CLIPVisionModel`]):
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModel), specifically
+            the
+            [clip-vit-huge-patch14](https://github.com/mlfoundations/open_clip/blob/main/docs/PRETRAINED.md#vit-h14-xlm-roberta-large)
+            variant.
+        transformer ([`WanAnimateTransformer3DModel`]):
+            Conditional Transformer to denoise the input latents.
+        scheduler ([`UniPCMultistepScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKLWan`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        image_processor ([`CLIPImageProcessor`]):
+            Image processor for preprocessing images before encoding.
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        text_encoder: UMT5EncoderModel,
+        vae: AutoencoderKLWan,
+        scheduler: UniPCMultistepScheduler,
+        image_processor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModel,
+        transformer: WanAnimateTransformer3DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            image_encoder=image_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+            image_processor=image_processor,
+        )
+
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        self.video_processor_for_mask = VideoProcessor(
+            vae_scale_factor=self.vae_scale_factor_spatial, do_normalize=False, do_convert_grayscale=True
+        )
+        # In case self.transformer is None (e.g. for some pipeline tests)
+        spatial_patch_size = self.transformer.config.patch_size[-2:] if self.transformer is not None else (2, 2)
+        self.vae_image_processor = WanAnimateImageProcessor(
+            vae_scale_factor=self.vae_scale_factor_spatial,
+            spatial_patch_size=spatial_patch_size,
+            resample="bilinear",
+            fill_color=0,
+        )
+        self.image_processor = image_processor
+
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: Union[str, List[str]] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt = [prompt_clean(u) for u in prompt]
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
+        seq_lens = mask.gt(0).sum(dim=1).long()
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
+        prompt_embeds = torch.stack(
+            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
+        )
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline.encode_image
+    def encode_image(
+        self,
+        image: PipelineImageInput,
+        device: Optional[torch.device] = None,
+    ):
+        device = device or self._execution_device
+        image = self.image_processor(images=image, return_tensors="pt").to(device)
+        image_embeds = self.image_encoder(**image, output_hidden_states=True)
+        return image_embeds.hidden_states[-2]
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 226,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        image,
+        pose_video,
+        face_video,
+        background_video,
+        mask_video,
+        height,
+        width,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        image_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        mode=None,
+        prev_segment_conditioning_frames=None,
+    ):
+        if image is not None and image_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `image`: {image} and `image_embeds`: {image_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        if image is None and image_embeds is None:
+            raise ValueError(
+                "Provide either `image` or `prompt_embeds`. Cannot leave both `image` and `image_embeds` undefined."
+            )
+        if image is not None and not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image):
+            raise ValueError(f"`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is {type(image)}")
+        if pose_video is None:
+            raise ValueError("Provide `pose_video`. Cannot leave `pose_video` undefined.")
+        if face_video is None:
+            raise ValueError("Provide `face_video`. Cannot leave `face_video` undefined.")
+        if not isinstance(pose_video, list) or not isinstance(face_video, list):
+            raise ValueError("`pose_video` and `face_video` must be lists of PIL images.")
+        if len(pose_video) == 0 or len(face_video) == 0:
+            raise ValueError("`pose_video` and `face_video` must contain at least one frame.")
+        if mode == "replace" and (background_video is None or mask_video is None):
+            raise ValueError(
+                "Provide `background_video` and `mask_video`. Cannot leave both `background_video` and `mask_video`"
+                " undefined when mode is `replace`."
+            )
+        if mode == "replace" and (not isinstance(background_video, list) or not isinstance(mask_video, list)):
+            raise ValueError("`background_video` and `mask_video` must be lists of PIL images when mode is `replace`.")
+
+        if height % 16 != 0 or width % 16 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
+                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        if mode is not None and (not isinstance(mode, str) or mode not in ("animate", "replace")):
+            raise ValueError(
+                f"`mode` has to be of type `str` and in ('animate', 'replace') but its type is {type(mode)} and value is {mode}"
+            )
+
+        if prev_segment_conditioning_frames is not None and (
+            not isinstance(prev_segment_conditioning_frames, int) or prev_segment_conditioning_frames not in (1, 5)
+        ):
+            raise ValueError(
+                f"`prev_segment_conditioning_frames` has to be of type `int` and 1 or 5 but its type is"
+                f" {type(prev_segment_conditioning_frames)} and value is {prev_segment_conditioning_frames}"
+            )
+
+    def get_i2v_mask(
+        self,
+        batch_size: int,
+        latent_t: int,
+        latent_h: int,
+        latent_w: int,
+        mask_len: int = 1,
+        mask_pixel_values: Optional[torch.Tensor] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Union[str, torch.device] = "cuda",
+    ) -> torch.Tensor:
+        # mask_pixel_values shape (if supplied): [B, C = 1, T, latent_h, latent_w]
+        if mask_pixel_values is None:
+            mask_lat_size = torch.zeros(
+                batch_size, 1, (latent_t - 1) * 4 + 1, latent_h, latent_w, dtype=dtype, device=device
+            )
+        else:
+            mask_lat_size = mask_pixel_values.clone().to(device=device, dtype=dtype)
+        mask_lat_size[:, :, :mask_len] = 1
+        first_frame_mask = mask_lat_size[:, :, 0:1]
+        # Repeat first frame mask self.vae_scale_factor_temporal (= 4) times in the frame dimension
+        first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
+        mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:]], dim=2)
+        mask_lat_size = mask_lat_size.view(
+            batch_size, -1, self.vae_scale_factor_temporal, latent_h, latent_w
+        ).transpose(1, 2)  # [B, C = 1, 4 * T_lat, H_lat, W_lat] --> [B, C = 4, T_lat, H_lat, W_lat]
+
+        return mask_lat_size
+
+    def prepare_reference_image_latents(
+        self,
+        image: torch.Tensor,
+        batch_size: int = 1,
+        sample_mode: int = "argmax",
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ) -> torch.Tensor:
+        # image shape: (B, C, H, W) or (B, C, T, H, W)
+        dtype = dtype or self.vae.dtype
+        if image.ndim == 4:
+            # Add a singleton frame dimension after the channels dimension
+            image = image.unsqueeze(2)
+
+        _, _, _, height, width = image.shape
+        latent_height = height // self.vae_scale_factor_spatial
+        latent_width = width // self.vae_scale_factor_spatial
+
+        # Encode image to latents using VAE
+        image = image.to(device=device, dtype=dtype)
+        if isinstance(generator, list):
+            # Like in prepare_latents, assume len(generator) == batch_size
+            ref_image_latents = [
+                retrieve_latents(self.vae.encode(image), generator=g, sample_mode=sample_mode) for g in generator
+            ]
+            ref_image_latents = torch.cat(ref_image_latents)
+        else:
+            ref_image_latents = retrieve_latents(self.vae.encode(image), generator, sample_mode)
+        # Standardize latents in preparation for Wan VAE encode
+        latents_mean = (
+            torch.tensor(self.vae.config.latents_mean)
+            .view(1, self.vae.config.z_dim, 1, 1, 1)
+            .to(ref_image_latents.device, ref_image_latents.dtype)
+        )
+        latents_recip_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+            ref_image_latents.device, ref_image_latents.dtype
+        )
+        ref_image_latents = (ref_image_latents - latents_mean) * latents_recip_std
+        # Handle the case where we supply one image and one generator, but batch_size > 1 (e.g. generating multiple
+        # videos per prompt)
+        if ref_image_latents.shape[0] == 1 and batch_size > 1:
+            ref_image_latents = ref_image_latents.expand(batch_size, -1, -1, -1, -1)
+
+        # Prepare I2V mask in latent space and prepend to the reference image latents along channel dim
+        reference_image_mask = self.get_i2v_mask(batch_size, 1, latent_height, latent_width, 1, None, dtype, device)
+        reference_image_latents = torch.cat([reference_image_mask, ref_image_latents], dim=1)
+
+        return reference_image_latents
+
+    def prepare_prev_segment_cond_latents(
+        self,
+        prev_segment_cond_video: Optional[torch.Tensor] = None,
+        background_video: Optional[torch.Tensor] = None,
+        mask_video: Optional[torch.Tensor] = None,
+        batch_size: int = 1,
+        segment_frame_length: int = 77,
+        start_frame: int = 0,
+        height: int = 720,
+        width: int = 1280,
+        prev_segment_cond_frames: int = 1,
+        task: str = "animate",
+        interpolation_mode: str = "bicubic",
+        sample_mode: str = "argmax",
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ) -> torch.Tensor:
+        # prev_segment_cond_video shape: (B, C, T, H, W) in pixel space if supplied
+        # background_video shape: (B, C, T, H, W) (same as prev_segment_cond_video shape)
+        # mask_video shape: (B, 1, T, H, W) (same as prev_segment_cond_video, but with only 1 channel)
+        dtype = dtype or self.vae.dtype
+        if prev_segment_cond_video is None:
+            if task == "replace":
+                prev_segment_cond_video = background_video[:, :, :prev_segment_cond_frames].to(dtype)
+            else:
+                cond_frames_shape = (batch_size, 3, prev_segment_cond_frames, height, width)  # In pixel space
+                prev_segment_cond_video = torch.zeros(cond_frames_shape, dtype=dtype, device=device)
+
+        data_batch_size, channels, _, segment_height, segment_width = prev_segment_cond_video.shape
+        num_latent_frames = (segment_frame_length - 1) // self.vae_scale_factor_temporal + 1
+        latent_height = height // self.vae_scale_factor_spatial
+        latent_width = width // self.vae_scale_factor_spatial
+        if segment_height != height or segment_width != width:
+            print(
+                f"Interpolating prev segment cond video from ({segment_width}, {segment_height}) to ({width}, {height})"
+            )
+            # Perform a 4D (spatial) rather than a 5D (spatiotemporal) reshape, following the original code
+            prev_segment_cond_video = prev_segment_cond_video.transpose(1, 2).flatten(0, 1)  # [B * T, C, H, W]
+            prev_segment_cond_video = F.interpolate(
+                prev_segment_cond_video, size=(height, width), mode=interpolation_mode
+            )
+            prev_segment_cond_video = prev_segment_cond_video.unflatten(0, (batch_size, -1)).transpose(1, 2)
+
+        # Fill the remaining part of the cond video segment with zeros (if animating) or the background video (if
+        # replacing).
+        if task == "replace":
+            remaining_segment = background_video[:, :, prev_segment_cond_frames:].to(dtype)
+        else:
+            remaining_segment_frames = segment_frame_length - prev_segment_cond_frames
+            remaining_segment = torch.zeros(
+                batch_size, channels, remaining_segment_frames, height, width, dtype=dtype, device=device
+            )
+
+        # Prepend the conditioning frames from the previous segment to the remaining segment video in the frame dim
+        prev_segment_cond_video = prev_segment_cond_video.to(dtype=dtype)
+        full_segment_cond_video = torch.cat([prev_segment_cond_video, remaining_segment], dim=2)
+
+        if isinstance(generator, list):
+            if data_batch_size == len(generator):
+                prev_segment_cond_latents = [
+                    retrieve_latents(self.vae.encode(full_segment_cond_video[i].unsqueeze(0)), g, sample_mode)
+                    for i, g in enumerate(generator)
+                ]
+            elif data_batch_size == 1:
+                # Like prepare_latents, assume len(generator) == batch_size
+                prev_segment_cond_latents = [
+                    retrieve_latents(self.vae.encode(full_segment_cond_video), g, sample_mode) for g in generator
+                ]
+            else:
+                raise ValueError(
+                    f"The batch size of the prev segment video should be either {len(generator)} or 1 but is"
+                    f" {data_batch_size}"
+                )
+            prev_segment_cond_latents = torch.cat(prev_segment_cond_latents)
+        else:
+            prev_segment_cond_latents = retrieve_latents(
+                self.vae.encode(full_segment_cond_video), generator, sample_mode
+            )
+        # Standardize latents in preparation for Wan VAE encode
+        latents_mean = (
+            torch.tensor(self.vae.config.latents_mean)
+            .view(1, self.vae.config.z_dim, 1, 1, 1)
+            .to(prev_segment_cond_latents.device, prev_segment_cond_latents.dtype)
+        )
+        latents_recip_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+            prev_segment_cond_latents.device, prev_segment_cond_latents.dtype
+        )
+        prev_segment_cond_latents = (prev_segment_cond_latents - latents_mean) * latents_recip_std
+
+        # Prepare I2V mask
+        if task == "replace":
+            mask_video = 1 - mask_video
+            mask_video = mask_video.permute(0, 2, 1, 3, 4)
+            mask_video = mask_video.flatten(0, 1)
+            mask_video = F.interpolate(mask_video, size=(latent_height, latent_width), mode="nearest")
+            mask_pixel_values = mask_video.unflatten(0, (batch_size, -1))
+            mask_pixel_values = mask_pixel_values.permute(0, 2, 1, 3, 4)  # output shape: [B, C = 1, T, H_lat, W_lat]
+        else:
+            mask_pixel_values = None
+        prev_segment_cond_mask = self.get_i2v_mask(
+            batch_size,
+            num_latent_frames,
+            latent_height,
+            latent_width,
+            mask_len=prev_segment_cond_frames if start_frame > 0 else 0,
+            mask_pixel_values=mask_pixel_values,
+            dtype=dtype,
+            device=device,
+        )
+
+        # Prepend cond I2V mask to prev segment cond latents along channel dimension
+        prev_segment_cond_latents = torch.cat([prev_segment_cond_mask, prev_segment_cond_latents], dim=1)
+        return prev_segment_cond_latents
+
+    def prepare_pose_latents(
+        self,
+        pose_video: torch.Tensor,
+        batch_size: int = 1,
+        sample_mode: int = "argmax",
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ) -> torch.Tensor:
+        # pose_video shape: (B, C, T, H, W)
+        pose_video = pose_video.to(device=device, dtype=dtype if dtype is not None else self.vae.dtype)
+        if isinstance(generator, list):
+            pose_latents = [
+                retrieve_latents(self.vae.encode(pose_video), generator=g, sample_mode=sample_mode) for g in generator
+            ]
+            pose_latents = torch.cat(pose_latents)
+        else:
+            pose_latents = retrieve_latents(self.vae.encode(pose_video), generator, sample_mode)
+        # Standardize latents in preparation for Wan VAE encode
+        latents_mean = (
+            torch.tensor(self.vae.config.latents_mean)
+            .view(1, self.vae.config.z_dim, 1, 1, 1)
+            .to(pose_latents.device, pose_latents.dtype)
+        )
+        latents_recip_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+            pose_latents.device, pose_latents.dtype
+        )
+        pose_latents = (pose_latents - latents_mean) * latents_recip_std
+        if pose_latents.shape[0] == 1 and batch_size > 1:
+            pose_latents = pose_latents.expand(batch_size, -1, -1, -1, -1)
+        return pose_latents
+
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 720,
+        width: int = 1280,
+        num_frames: int = 77,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        latent_height = height // self.vae_scale_factor_spatial
+        latent_width = width // self.vae_scale_factor_spatial
+
+        shape = (batch_size, num_channels_latents, num_latent_frames + 1, latent_height, latent_width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        return latents
+
+    def pad_video_frames(self, frames: List[Any], num_target_frames: int) -> List[Any]:
+        """
+        Pads an array-like video `frames` to `num_target_frames` using a "reflect"-like strategy. The frame dimension
+        is assumed to be the first dimension. In the 1D case, we can visualize this strategy as follows:
+
+        pad_video_frames([1, 2, 3, 4, 5], 10) -> [1, 2, 3, 4, 5, 4, 3, 2, 1, 2]
+        """
+        idx = 0
+        flip = False
+        target_frames = []
+        while len(target_frames) < num_target_frames:
+            target_frames.append(deepcopy(frames[idx]))
+            if flip:
+                idx -= 1
+            else:
+                idx += 1
+            if idx == 0 or idx == len(frames) - 1:
+                flip = not flip
+
+        return target_frames
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        pose_video: List[PIL.Image.Image],
+        face_video: List[PIL.Image.Image],
+        background_video: Optional[List[PIL.Image.Image]] = None,
+        mask_video: Optional[List[PIL.Image.Image]] = None,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        height: int = 720,
+        width: int = 1280,
+        segment_frame_length: int = 77,
+        num_inference_steps: int = 20,
+        mode: str = "animate",
+        prev_segment_conditioning_frames: int = 1,
+        motion_encode_batch_size: Optional[int] = None,
+        guidance_scale: float = 1.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        image_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            image (`PipelineImageInput`):
+                The input character image to condition the generation on. Must be an image, a list of images or a
+                `torch.Tensor`.
+            pose_video (`List[PIL.Image.Image]`):
+                The input pose video to condition the generation on. Must be a list of PIL images.
+            face_video (`List[PIL.Image.Image]`):
+                The input face video to condition the generation on. Must be a list of PIL images.
+            background_video (`List[PIL.Image.Image]`, *optional*):
+                When mode is `"replace"`, the input background video to condition the generation on. Must be a list of
+                PIL images.
+            mask_video (`List[PIL.Image.Image]`, *optional*):
+                When mode is `"replace"`, the input mask video to condition the generation on. Must be a list of PIL
+                images.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            mode (`str`, defaults to `"animation"`):
+                The mode of the generation. Choose between `"animate"` and `"replace"`.
+            prev_segment_conditioning_frames (`int`, defaults to `1`):
+                The number of frames from the previous video segment to be used for temporal guidance. Recommended to
+                be 1 or 5. In general, should be 4N + 1, where N is a non-negative integer.
+            motion_encode_batch_size (`int`, *optional*):
+                The batch size for batched encoding of the face video via the motion encoder. This allows trading off
+                inference speed for lower memory usage by setting a smaller batch size. Will default to
+                `self.transformer.config.motion_encoder_batch_size` if not set.
+            height (`int`, defaults to `720`):
+                The height of the generated video.
+            width (`int`, defaults to `1280`):
+                The width of the generated video.
+            segment_frame_length (`int`, defaults to `77`):
+                The number of frames in each generated video segment. The total frames of video generated will be equal
+                to the number of frames in `pose_video`; we will generate the video in segments until we have hit this
+                length. In general, should be 4N + 1, where N is a non-negative integer.
+            num_inference_steps (`int`, defaults to `20`):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to `1.0`):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality. By default, CFG is not used in Wan
+                Animate inference.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `negative_prompt` input argument.
+            image_embeds (`torch.Tensor`, *optional*):
+                Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
+                image embeddings are generated from the `image` input argument.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
+                truncated. If the prompt is shorter, it will be padded to this length.
+
+        Examples:
+
+        Returns:
+            [`~WanPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned where
+                the first element is a list with the generated images and the second element is a list of `bool`s
+                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
+        """
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            image,
+            pose_video,
+            face_video,
+            background_video,
+            mask_video,
+            height,
+            width,
+            prompt_embeds,
+            negative_prompt_embeds,
+            image_embeds,
+            callback_on_step_end_tensor_inputs,
+            mode,
+            prev_segment_conditioning_frames,
+        )
+
+        if segment_frame_length % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`segment_frame_length - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the"
+                f" nearest number."
+            )
+            segment_frame_length = (
+                segment_frame_length // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            )
+        segment_frame_length = max(segment_frame_length, 1)
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
+        self._interrupt = False
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # As we generate in segments of `segment_frame_length`, set the target frame length to be the least multiple
+        # of the effective segment length greater than or equal to the length of `pose_video`.
+        cond_video_frames = len(pose_video)
+        effective_segment_length = segment_frame_length - prev_segment_conditioning_frames
+        last_segment_frames = (cond_video_frames - prev_segment_conditioning_frames) % effective_segment_length
+        if last_segment_frames == 0:
+            num_padding_frames = 0
+        else:
+            num_padding_frames = effective_segment_length - last_segment_frames
+        num_target_frames = cond_video_frames + num_padding_frames
+        num_segments = num_target_frames // effective_segment_length
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+
+        transformer_dtype = self.transformer.dtype
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+        # 4. Preprocess and encode the reference (character) image
+        image_height, image_width = self.video_processor.get_default_height_width(image)
+        if image_height != height or image_width != width:
+            logger.warning(f"Reshaping reference image from ({image_width}, {image_height}) to ({width}, {height})")
+        image_pixels = self.vae_image_processor.preprocess(image, height=height, width=width, resize_mode="fill").to(
+            device, dtype=torch.float32
+        )
+
+        # Get CLIP features from the reference image
+        if image_embeds is None:
+            image_embeds = self.encode_image(image, device)
+        image_embeds = image_embeds.repeat(batch_size * num_videos_per_prompt, 1, 1)
+        image_embeds = image_embeds.to(transformer_dtype)
+
+        # 5. Encode conditioning videos (pose, face)
+        pose_video = self.pad_video_frames(pose_video, num_target_frames)
+        face_video = self.pad_video_frames(face_video, num_target_frames)
+
+        # TODO: also support np.ndarray input (e.g. from decord like the original implementation?)
+        pose_video_width, pose_video_height = pose_video[0].size
+        if pose_video_height != height or pose_video_width != width:
+            logger.warning(
+                f"Reshaping pose video from ({pose_video_width}, {pose_video_height}) to ({width}, {height})"
+            )
+        pose_video = self.video_processor.preprocess_video(pose_video, height=height, width=width).to(
+            device, dtype=torch.float32
+        )
+
+        face_video_width, face_video_height = face_video[0].size
+        expected_face_size = self.transformer.config.motion_encoder_size
+        if face_video_width != expected_face_size or face_video_height != expected_face_size:
+            logger.warning(
+                f"Reshaping face video from ({face_video_width}, {face_video_height}) to ({expected_face_size},"
+                f" {expected_face_size})"
+            )
+        face_video = self.video_processor.preprocess_video(
+            face_video, height=expected_face_size, width=expected_face_size
+        ).to(device, dtype=torch.float32)
+
+        if mode == "replace":
+            background_video = self.pad_video_frames(background_video, num_target_frames)
+            mask_video = self.pad_video_frames(mask_video, num_target_frames)
+
+            background_video = self.video_processor.preprocess_video(background_video, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
+            mask_video = self.video_processor_for_mask.preprocess_video(mask_video, height=height, width=width).to(
+                device, dtype=torch.float32
+            )
+
+        # 6. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 7. Prepare latent variables which stay constant for all inference segments
+        num_channels_latents = self.vae.config.z_dim
+
+        # Get VAE-encoded latents of the reference (character) image
+        reference_image_latents = self.prepare_reference_image_latents(
+            image_pixels, batch_size * num_videos_per_prompt, generator=generator, device=device
+        )
+
+        # 8. Loop over video inference segments
+        start = 0
+        end = segment_frame_length  # Data space frames, not latent frames
+        all_out_frames = []
+        out_frames = None
+
+        for _ in range(num_segments):
+            assert start + prev_segment_conditioning_frames < cond_video_frames
+
+            # Sample noisy latents from prior for the current inference segment
+            latents = self.prepare_latents(
+                batch_size * num_videos_per_prompt,
+                num_channels_latents=num_channels_latents,
+                height=height,
+                width=width,
+                num_frames=segment_frame_length,
+                dtype=torch.float32,
+                device=device,
+                generator=generator,
+                latents=latents if start == 0 else None,  # Only use pre-calculated latents for first segment
+            )
+
+            pose_video_segment = pose_video[:, :, start:end]
+            face_video_segment = face_video[:, :, start:end]
+
+            face_video_segment = face_video_segment.expand(batch_size * num_videos_per_prompt, -1, -1, -1, -1)
+            face_video_segment = face_video_segment.to(dtype=transformer_dtype)
+
+            if start > 0:
+                prev_segment_cond_video = out_frames[:, :, -prev_segment_conditioning_frames:].clone().detach()
+            else:
+                prev_segment_cond_video = None
+
+            if mode == "replace":
+                background_video_segment = background_video[:, :, start:end]
+                mask_video_segment = mask_video[:, :, start:end]
+
+                background_video_segment = background_video_segment.expand(
+                    batch_size * num_videos_per_prompt, -1, -1, -1, -1
+                )
+                mask_video_segment = mask_video_segment.expand(batch_size * num_videos_per_prompt, -1, -1, -1, -1)
+            else:
+                background_video_segment = None
+                mask_video_segment = None
+
+            pose_latents = self.prepare_pose_latents(
+                pose_video_segment, batch_size * num_videos_per_prompt, generator=generator, device=device
+            )
+            pose_latents = pose_latents.to(dtype=transformer_dtype)
+
+            prev_segment_cond_latents = self.prepare_prev_segment_cond_latents(
+                prev_segment_cond_video,
+                background_video=background_video_segment,
+                mask_video=mask_video_segment,
+                batch_size=batch_size * num_videos_per_prompt,
+                segment_frame_length=segment_frame_length,
+                start_frame=start,
+                height=height,
+                width=width,
+                prev_segment_cond_frames=prev_segment_conditioning_frames,
+                task=mode,
+                generator=generator,
+                device=device,
+            )
+
+            # Concatenate the reference latents in the frame dimension
+            reference_latents = torch.cat([reference_image_latents, prev_segment_cond_latents], dim=2)
+
+            # 8.1 Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            self._num_timesteps = len(timesteps)
+
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+
+                    self._current_timestep = t
+
+                    # Concatenate the reference image + prev segment conditioning in the channel dim
+                    latent_model_input = torch.cat([latents, reference_latents], dim=1).to(transformer_dtype)
+                    timestep = t.expand(latents.shape[0])
+
+                    with self.transformer.cache_context("cond"):
+                        noise_pred = self.transformer(
+                            hidden_states=latent_model_input,
+                            timestep=timestep,
+                            encoder_hidden_states=prompt_embeds,
+                            encoder_hidden_states_image=image_embeds,
+                            pose_hidden_states=pose_latents,
+                            face_pixel_values=face_video_segment,
+                            motion_encode_batch_size=motion_encode_batch_size,
+                            attention_kwargs=attention_kwargs,
+                            return_dict=False,
+                        )[0]
+
+                    if self.do_classifier_free_guidance:
+                        # Blank out face for unconditional guidance (set all pixels to -1)
+                        face_pixel_values_uncond = face_video_segment * 0 - 1
+                        with self.transformer.cache_context("uncond"):
+                            noise_uncond = self.transformer(
+                                hidden_states=latent_model_input,
+                                timestep=timestep,
+                                encoder_hidden_states=negative_prompt_embeds,
+                                encoder_hidden_states_image=image_embeds,
+                                pose_hidden_states=pose_latents,
+                                face_pixel_values=face_pixel_values_uncond,
+                                motion_encode_batch_size=motion_encode_batch_size,
+                                attention_kwargs=attention_kwargs,
+                                return_dict=False,
+                            )[0]
+                            noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+
+                    if XLA_AVAILABLE:
+                        xm.mark_step()
+
+            latents = latents.to(self.vae.dtype)
+            # Destandardize latents in preparation for Wan VAE decoding
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_recip_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(
+                1, self.vae.config.z_dim, 1, 1, 1
+            ).to(latents.device, latents.dtype)
+            latents = latents / latents_recip_std + latents_mean
+            # Skip the first latent frame (used for conditioning)
+            out_frames = self.vae.decode(latents[:, :, 1:], return_dict=False)[0]
+
+            if start > 0:
+                out_frames = out_frames[:, :, prev_segment_conditioning_frames:]
+            all_out_frames.append(out_frames)
+
+            start += effective_segment_length
+            end += effective_segment_length
+
+            # Reset scheduler timesteps / state for next denoising loop
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+        self._current_timestep = None
+        assert start + prev_segment_conditioning_frames >= cond_video_frames
+
+        if not output_type == "latent":
+            video = torch.cat(all_out_frames, dim=2)[:, :, :cond_video_frames]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return WanPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
index 63e557a98fbe..351ae2e70563 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py
@@ -758,11 +758,11 @@ def __call__(
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
             guidance_scale (`float`, defaults to `5.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
+                the text `prompt`, usually at the expense of lower image quality.
             guidance_scale_2 (`float`, *optional*, defaults to `None`):
                 Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
                 `boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
diff --git a/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py
index ae821a55788a..dbdd50871b43 100644
--- a/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py
+++ b/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py
@@ -14,17 +14,16 @@
 # limitations under the License.
 
 import math
-from typing import Dict, Union
 
 import torch
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...models.attention import AttentionMixin
 from ...models.attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
@@ -32,7 +31,7 @@
 from .modeling_wuerstchen_common import AttnBlock, ResBlock, TimestepBlock, WuerstchenLayerNorm
 
 
-class WuerstchenPrior(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+class WuerstchenPrior(ModelMixin, AttentionMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
     unet_name = "prior"
     _supports_gradient_checkpointing = True
 
@@ -61,66 +60,6 @@ def __init__(self, c_in=16, c=1280, c_cond=1024, c_r=64, depth=16, nhead=16, dro
         self.gradient_checkpointing = False
         self.set_default_attn_processor()
 
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
     # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
diff --git a/src/diffusers/pipelines/z_image/__init__.py b/src/diffusers/pipelines/z_image/__init__.py
new file mode 100644
index 000000000000..f4342713e3e9
--- /dev/null
+++ b/src/diffusers/pipelines/z_image/__init__.py
@@ -0,0 +1,52 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa: F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_output"] = ["ZImagePipelineOutput"]
+    _import_structure["pipeline_z_image"] = ["ZImagePipeline"]
+    _import_structure["pipeline_z_image_img2img"] = ["ZImageImg2ImgPipeline"]
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_output import ZImagePipelineOutput
+        from .pipeline_z_image import ZImagePipeline
+        from .pipeline_z_image_img2img import ZImageImg2ImgPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/z_image/pipeline_output.py b/src/diffusers/pipelines/z_image/pipeline_output.py
new file mode 100644
index 000000000000..69a320fc036a
--- /dev/null
+++ b/src/diffusers/pipelines/z_image/pipeline_output.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+
+from diffusers.utils import BaseOutput
+
+
+@dataclass
+class ZImagePipelineOutput(BaseOutput):
+    """
+    Output class for Z-Image pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image.py b/src/diffusers/pipelines/z_image/pipeline_z_image.py
new file mode 100644
index 000000000000..82bdd7d361b7
--- /dev/null
+++ b/src/diffusers/pipelines/z_image/pipeline_z_image.py
@@ -0,0 +1,594 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import AutoTokenizer, PreTrainedModel
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import FromSingleFileMixin, ZImageLoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import ZImageTransformer2DModel
+from ...pipelines.pipeline_utils import DiffusionPipeline
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from .pipeline_output import ZImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import ZImagePipeline
+
+        >>> pipe = ZImagePipeline.from_pretrained("Z-a-o/Z-Image-Turbo", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+
+        >>> # Optionally, set the attention backend to flash-attn 2 or 3, default is SDPA in PyTorch.
+        >>> # (1) Use flash attention 2
+        >>> # pipe.transformer.set_attention_backend("flash")
+        >>> # (2) Use flash attention 3
+        >>> # pipe.transformer.set_attention_backend("_flash_3")
+
+        >>> prompt = "一幅为名为“造相「Z-IMAGE-TURBO」”的项目设计的创意海报。画面巧妙地将文字概念视觉化：一辆复古蒸汽小火车化身为巨大的拉链头，正拉开厚厚的冬日积雪，展露出一个生机盎然的春天。"
+        >>> image = pipe(
+        ...     prompt,
+        ...     height=1024,
+        ...     width=1024,
+        ...     num_inference_steps=9,
+        ...     guidance_scale=0.0,
+        ...     generator=torch.Generator("cuda").manual_seed(42),
+        ... ).images[0]
+        >>> image.save("zimage.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class ZImagePipeline(DiffusionPipeline, ZImageLoraLoaderMixin, FromSingleFileMixin):
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: PreTrainedModel,
+        tokenizer: AutoTokenizer,
+        transformer: ZImageTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            transformer=transformer,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt_embeds = self._encode_prompt(
+            prompt=prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if do_classifier_free_guidance:
+            if negative_prompt is None:
+                negative_prompt = ["" for _ in prompt]
+            else:
+                negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            assert len(prompt) == len(negative_prompt)
+            negative_prompt_embeds = self._encode_prompt(
+                prompt=negative_prompt,
+                device=device,
+                prompt_embeds=negative_prompt_embeds,
+                max_sequence_length=max_sequence_length,
+            )
+        else:
+            negative_prompt_embeds = []
+        return prompt_embeds, negative_prompt_embeds
+
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        max_sequence_length: int = 512,
+    ) -> List[torch.FloatTensor]:
+        device = device or self._execution_device
+
+        if prompt_embeds is not None:
+            return prompt_embeds
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+
+        for i, prompt_item in enumerate(prompt):
+            messages = [
+                {"role": "user", "content": prompt_item},
+            ]
+            prompt_item = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=True,
+            )
+            prompt[i] = prompt_item
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids.to(device)
+        prompt_masks = text_inputs.attention_mask.to(device).bool()
+
+        prompt_embeds = self.text_encoder(
+            input_ids=text_input_ids,
+            attention_mask=prompt_masks,
+            output_hidden_states=True,
+        ).hidden_states[-2]
+
+        embeddings_list = []
+
+        for i in range(len(prompt_embeds)):
+            embeddings_list.append(prompt_embeds[i][prompt_masks[i]])
+
+        return embeddings_list
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: float = 5.0,
+        cfg_normalization: bool = False,
+        cfg_truncation: float = 1.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            height (`int`, *optional*, defaults to 1024):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 1024):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            cfg_normalization (`bool`, *optional*, defaults to False):
+                Whether to apply configuration normalization.
+            cfg_truncation (`float`, *optional*, defaults to 1.0):
+                The truncation value for configuration.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.ZImagePipelineOutput`] instead of a plain
+                tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`: [`~pipelines.z_image.ZImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+        height = height or 1024
+        width = width or 1024
+
+        vae_scale = self.vae_scale_factor * 2
+        if height % vae_scale != 0:
+            raise ValueError(
+                f"Height must be divisible by {vae_scale} (got {height}). "
+                f"Please adjust the height to a multiple of {vae_scale}."
+            )
+        if width % vae_scale != 0:
+            raise ValueError(
+                f"Width must be divisible by {vae_scale} (got {width}). "
+                f"Please adjust the width to a multiple of {vae_scale}."
+            )
+
+        device = self._execution_device
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        self._cfg_normalization = cfg_normalization
+        self._cfg_truncation = cfg_truncation
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = len(prompt_embeds)
+
+        # If prompt_embeds is provided and prompt is None, skip encoding
+        if prompt_embeds is not None and prompt is None:
+            if self.do_classifier_free_guidance and negative_prompt_embeds is None:
+                raise ValueError(
+                    "When `prompt_embeds` is provided without `prompt`, "
+                    "`negative_prompt_embeds` must also be provided for classifier-free guidance."
+                )
+        else:
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+            ) = self.encode_prompt(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                device=device,
+                max_sequence_length=max_sequence_length,
+            )
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.in_channels
+
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+
+        # Repeat prompt_embeds for num_images_per_prompt
+        if num_images_per_prompt > 1:
+            prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
+            if self.do_classifier_free_guidance and negative_prompt_embeds:
+                negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
+
+        actual_batch_size = batch_size * num_images_per_prompt
+        image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
+
+        # 5. Prepare timesteps
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        self.scheduler.sigma_min = 0.0
+        scheduler_kwargs = {"mu": mu}
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            **scheduler_kwargs,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0])
+                timestep = (1000 - timestep) / 1000
+                # Normalized time for time-aware config (0 at start, 1 at end)
+                t_norm = timestep[0].item()
+
+                # Handle cfg truncation
+                current_guidance_scale = self.guidance_scale
+                if (
+                    self.do_classifier_free_guidance
+                    and self._cfg_truncation is not None
+                    and float(self._cfg_truncation) <= 1
+                ):
+                    if t_norm > self._cfg_truncation:
+                        current_guidance_scale = 0.0
+
+                # Run CFG only if configured AND scale is non-zero
+                apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
+
+                if apply_cfg:
+                    latents_typed = latents.to(self.transformer.dtype)
+                    latent_model_input = latents_typed.repeat(2, 1, 1, 1)
+                    prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
+                    timestep_model_input = timestep.repeat(2)
+                else:
+                    latent_model_input = latents.to(self.transformer.dtype)
+                    prompt_embeds_model_input = prompt_embeds
+                    timestep_model_input = timestep
+
+                latent_model_input = latent_model_input.unsqueeze(2)
+                latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                model_out_list = self.transformer(
+                    latent_model_input_list, timestep_model_input, prompt_embeds_model_input, return_dict=False
+                )[0]
+
+                if apply_cfg:
+                    # Perform CFG
+                    pos_out = model_out_list[:actual_batch_size]
+                    neg_out = model_out_list[actual_batch_size:]
+
+                    noise_pred = []
+                    for j in range(actual_batch_size):
+                        pos = pos_out[j].float()
+                        neg = neg_out[j].float()
+
+                        pred = pos + current_guidance_scale * (pos - neg)
+
+                        # Renormalization
+                        if self._cfg_normalization and float(self._cfg_normalization) > 0.0:
+                            ori_pos_norm = torch.linalg.vector_norm(pos)
+                            new_pos_norm = torch.linalg.vector_norm(pred)
+                            max_new_norm = ori_pos_norm * float(self._cfg_normalization)
+                            if new_pos_norm > max_new_norm:
+                                pred = pred * (max_new_norm / new_pos_norm)
+
+                        noise_pred.append(pred)
+
+                    noise_pred = torch.stack(noise_pred, dim=0)
+                else:
+                    noise_pred = torch.stack([t.float() for t in model_out_list], dim=0)
+
+                noise_pred = noise_pred.squeeze(2)
+                noise_pred = -noise_pred
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0]
+                assert latents.dtype == torch.float32
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = latents.to(self.vae.dtype)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ZImagePipelineOutput(images=image)
diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image_img2img.py b/src/diffusers/pipelines/z_image/pipeline_z_image_img2img.py
new file mode 100644
index 000000000000..2b3e80a2082b
--- /dev/null
+++ b/src/diffusers/pipelines/z_image/pipeline_z_image_img2img.py
@@ -0,0 +1,709 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import AutoTokenizer, PreTrainedModel
+
+from ...image_processor import PipelineImageInput, VaeImageProcessor
+from ...loaders import FromSingleFileMixin, ZImageLoraLoaderMixin
+from ...models.autoencoders import AutoencoderKL
+from ...models.transformers import ZImageTransformer2DModel
+from ...pipelines.pipeline_utils import DiffusionPipeline
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from .pipeline_output import ZImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import ZImageImg2ImgPipeline
+        >>> from diffusers.utils import load_image
+
+        >>> pipe = ZImageImg2ImgPipeline.from_pretrained("Z-a-o/Z-Image-Turbo", torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+
+        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+        >>> init_image = load_image(url).resize((1024, 1024))
+
+        >>> prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors"
+        >>> image = pipe(
+        ...     prompt,
+        ...     image=init_image,
+        ...     strength=0.6,
+        ...     num_inference_steps=9,
+        ...     guidance_scale=0.0,
+        ...     generator=torch.Generator("cuda").manual_seed(42),
+        ... ).images[0]
+        >>> image.save("zimage_img2img.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class ZImageImg2ImgPipeline(DiffusionPipeline, ZImageLoraLoaderMixin, FromSingleFileMixin):
+    r"""
+    The ZImage pipeline for image-to-image generation.
+
+    Args:
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`PreTrainedModel`]):
+            A text encoder model to encode text prompts.
+        tokenizer ([`AutoTokenizer`]):
+            A tokenizer to tokenize text prompts.
+        transformer ([`ZImageTransformer2DModel`]):
+            A ZImage transformer model to denoise the encoded image latents.
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _optional_components = []
+    _callback_tensor_inputs = ["latents", "prompt_embeds"]
+
+    def __init__(
+        self,
+        scheduler: FlowMatchEulerDiscreteScheduler,
+        vae: AutoencoderKL,
+        text_encoder: PreTrainedModel,
+        tokenizer: AutoTokenizer,
+        transformer: ZImageTransformer2DModel,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            transformer=transformer,
+        )
+        self.vae_scale_factor = (
+            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+
+    # Copied from diffusers.pipelines.z_image.pipeline_z_image.ZImagePipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        max_sequence_length: int = 512,
+    ):
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt_embeds = self._encode_prompt(
+            prompt=prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            max_sequence_length=max_sequence_length,
+        )
+
+        if do_classifier_free_guidance:
+            if negative_prompt is None:
+                negative_prompt = ["" for _ in prompt]
+            else:
+                negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+            assert len(prompt) == len(negative_prompt)
+            negative_prompt_embeds = self._encode_prompt(
+                prompt=negative_prompt,
+                device=device,
+                prompt_embeds=negative_prompt_embeds,
+                max_sequence_length=max_sequence_length,
+            )
+        else:
+            negative_prompt_embeds = []
+        return prompt_embeds, negative_prompt_embeds
+
+    # Copied from diffusers.pipelines.z_image.pipeline_z_image.ZImagePipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        max_sequence_length: int = 512,
+    ) -> List[torch.FloatTensor]:
+        device = device or self._execution_device
+
+        if prompt_embeds is not None:
+            return prompt_embeds
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+
+        for i, prompt_item in enumerate(prompt):
+            messages = [
+                {"role": "user", "content": prompt_item},
+            ]
+            prompt_item = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                enable_thinking=True,
+            )
+            prompt[i] = prompt_item
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        text_input_ids = text_inputs.input_ids.to(device)
+        prompt_masks = text_inputs.attention_mask.to(device).bool()
+
+        prompt_embeds = self.text_encoder(
+            input_ids=text_input_ids,
+            attention_mask=prompt_masks,
+            output_hidden_states=True,
+        ).hidden_states[-2]
+
+        embeddings_list = []
+
+        for i in range(len(prompt_embeds)):
+            embeddings_list.append(prompt_embeds[i][prompt_masks[i]])
+
+        return embeddings_list
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(
+        self,
+        image,
+        timestep,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        width = 2 * (int(width) // (self.vae_scale_factor * 2))
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        # Encode the input image
+        image = image.to(device=device, dtype=dtype)
+        if image.shape[1] != num_channels_latents:
+            if isinstance(generator, list):
+                image_latents = [
+                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                    for i in range(image.shape[0])
+                ]
+                image_latents = torch.cat(image_latents, dim=0)
+            else:
+                image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+            # Apply scaling (inverse of decoding: decode does latents/scaling_factor + shift_factor)
+            image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+        else:
+            image_latents = image
+
+        # Handle batch size expansion
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+            )
+
+        # Add noise using flow matching scale_noise
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1
+
+    @property
+    def joint_attention_kwargs(self):
+        return self._joint_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: PipelineImageInput = None,
+        strength: float = 0.6,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: float = 5.0,
+        cfg_normalization: bool = False,
+        cfg_truncation: float = 1.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for image-to-image generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
+                list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or
+                a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`.
+            strength (`float`, *optional*, defaults to 0.6):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
+            height (`int`, *optional*, defaults to 1024):
+                The height in pixels of the generated image. If not provided, uses the input image height.
+            width (`int`, *optional*, defaults to 1024):
+                The width in pixels of the generated image. If not provided, uses the input image width.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            cfg_normalization (`bool`, *optional*, defaults to False):
+                Whether to apply configuration normalization.
+            cfg_truncation (`float`, *optional*, defaults to 1.0):
+                The truncation value for configuration.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.ZImagePipelineOutput`] instead of a plain
+                tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`: [`~pipelines.z_image.ZImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+        # 1. Check inputs and validate strength
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
+
+        # 2. Preprocess image
+        init_image = self.image_processor.preprocess(image)
+        init_image = init_image.to(dtype=torch.float32)
+
+        # Get dimensions from the preprocessed image if not specified
+        if height is None:
+            height = init_image.shape[-2]
+        if width is None:
+            width = init_image.shape[-1]
+
+        vae_scale = self.vae_scale_factor * 2
+        if height % vae_scale != 0:
+            raise ValueError(
+                f"Height must be divisible by {vae_scale} (got {height}). "
+                f"Please adjust the height to a multiple of {vae_scale}."
+            )
+        if width % vae_scale != 0:
+            raise ValueError(
+                f"Width must be divisible by {vae_scale} (got {width}). "
+                f"Please adjust the width to a multiple of {vae_scale}."
+            )
+
+        device = self._execution_device
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        self._cfg_normalization = cfg_normalization
+        self._cfg_truncation = cfg_truncation
+
+        # 3. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = len(prompt_embeds)
+
+        # If prompt_embeds is provided and prompt is None, skip encoding
+        if prompt_embeds is not None and prompt is None:
+            if self.do_classifier_free_guidance and negative_prompt_embeds is None:
+                raise ValueError(
+                    "When `prompt_embeds` is provided without `prompt`, "
+                    "`negative_prompt_embeds` must also be provided for classifier-free guidance."
+                )
+        else:
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+            ) = self.encode_prompt(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                device=device,
+                max_sequence_length=max_sequence_length,
+            )
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.in_channels
+
+        # Repeat prompt_embeds for num_images_per_prompt
+        if num_images_per_prompt > 1:
+            prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
+            if self.do_classifier_free_guidance and negative_prompt_embeds:
+                negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
+
+        actual_batch_size = batch_size * num_images_per_prompt
+
+        # Calculate latent dimensions for image_seq_len
+        latent_height = 2 * (int(height) // (self.vae_scale_factor * 2))
+        latent_width = 2 * (int(width) // (self.vae_scale_factor * 2))
+        image_seq_len = (latent_height // 2) * (latent_width // 2)
+
+        # 5. Prepare timesteps
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        self.scheduler.sigma_min = 0.0
+        scheduler_kwargs = {"mu": mu}
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            **scheduler_kwargs,
+        )
+
+        # 6. Adjust timesteps based on strength
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(actual_batch_size)
+
+        # 7. Prepare latents from image
+        latents = self.prepare_latents(
+            init_image,
+            latent_timestep,
+            actual_batch_size,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds[0].dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
+        # 8. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0])
+                timestep = (1000 - timestep) / 1000
+                # Normalized time for time-aware config (0 at start, 1 at end)
+                t_norm = timestep[0].item()
+
+                # Handle cfg truncation
+                current_guidance_scale = self.guidance_scale
+                if (
+                    self.do_classifier_free_guidance
+                    and self._cfg_truncation is not None
+                    and float(self._cfg_truncation) <= 1
+                ):
+                    if t_norm > self._cfg_truncation:
+                        current_guidance_scale = 0.0
+
+                # Run CFG only if configured AND scale is non-zero
+                apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
+
+                if apply_cfg:
+                    latents_typed = latents.to(self.transformer.dtype)
+                    latent_model_input = latents_typed.repeat(2, 1, 1, 1)
+                    prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
+                    timestep_model_input = timestep.repeat(2)
+                else:
+                    latent_model_input = latents.to(self.transformer.dtype)
+                    prompt_embeds_model_input = prompt_embeds
+                    timestep_model_input = timestep
+
+                latent_model_input = latent_model_input.unsqueeze(2)
+                latent_model_input_list = list(latent_model_input.unbind(dim=0))
+
+                model_out_list = self.transformer(
+                    latent_model_input_list,
+                    timestep_model_input,
+                    prompt_embeds_model_input,
+                )[0]
+
+                if apply_cfg:
+                    # Perform CFG
+                    pos_out = model_out_list[:actual_batch_size]
+                    neg_out = model_out_list[actual_batch_size:]
+
+                    noise_pred = []
+                    for j in range(actual_batch_size):
+                        pos = pos_out[j].float()
+                        neg = neg_out[j].float()
+
+                        pred = pos + current_guidance_scale * (pos - neg)
+
+                        # Renormalization
+                        if self._cfg_normalization and float(self._cfg_normalization) > 0.0:
+                            ori_pos_norm = torch.linalg.vector_norm(pos)
+                            new_pos_norm = torch.linalg.vector_norm(pred)
+                            max_new_norm = ori_pos_norm * float(self._cfg_normalization)
+                            if new_pos_norm > max_new_norm:
+                                pred = pred * (max_new_norm / new_pos_norm)
+
+                        noise_pred.append(pred)
+
+                    noise_pred = torch.stack(noise_pred, dim=0)
+                else:
+                    noise_pred = torch.stack([t.float() for t in model_out_list], dim=0)
+
+                noise_pred = noise_pred.squeeze(2)
+                noise_pred = -noise_pred
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0]
+                assert latents.dtype == torch.float32
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        if output_type == "latent":
+            image = latents
+
+        else:
+            latents = latents.to(self.vae.dtype)
+            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+            image = self.vae.decode(latents, return_dict=False)[0]
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ZImagePipelineOutput(images=image)
diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py
index 534f752321b3..7312036f52d0 100644
--- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py
+++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py
@@ -27,7 +27,7 @@
 
 class NVIDIAModelOptQuantizer(DiffusersQuantizer):
     r"""
-    Diffusers Quantizer for TensorRT Model Optimizer
+    Diffusers Quantizer for Nvidia-Model Optimizer
     """
 
     use_keep_in_fp32_modules = True
diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py
index 238b8d869171..a0b8fbc862b0 100644
--- a/src/diffusers/schedulers/scheduling_amused.py
+++ b/src/diffusers/schedulers/scheduling_amused.py
@@ -1,6 +1,6 @@
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import torch
 
@@ -9,13 +9,48 @@
 from .scheduling_utils import SchedulerMixin
 
 
-def gumbel_noise(t, generator=None):
+def gumbel_noise(t: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
+    """
+    Generate Gumbel noise for sampling.
+
+    Args:
+        t (`torch.Tensor`):
+            Input tensor to match the shape and dtype of the output noise.
+        generator (`torch.Generator`, *optional*):
+            A random number generator for reproducible sampling.
+
+    Returns:
+        `torch.Tensor`:
+            Gumbel-distributed noise with the same shape, dtype, and device as the input tensor.
+    """
     device = generator.device if generator is not None else t.device
     noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device)
     return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20))
 
 
-def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
+def mask_by_random_topk(
+    mask_len: torch.Tensor,
+    probs: torch.Tensor,
+    temperature: float = 1.0,
+    generator: Optional[torch.Generator] = None,
+) -> torch.Tensor:
+    """
+    Mask tokens by selecting the top-k lowest confidence scores with temperature-based randomness.
+
+    Args:
+        mask_len (`torch.Tensor`):
+            Number of tokens to mask per sample in the batch.
+        probs (`torch.Tensor`):
+            Probability scores for each token.
+        temperature (`float`, *optional*, defaults to 1.0):
+            Temperature parameter for controlling randomness in the masking process.
+        generator (`torch.Generator`, *optional*):
+            A random number generator for reproducible sampling.
+
+    Returns:
+        `torch.Tensor`:
+            Boolean mask indicating which tokens should be masked.
+    """
     confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator)
     sorted_confidence = torch.sort(confidence, dim=-1).values
     cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
@@ -29,28 +64,46 @@ class AmusedSchedulerOutput(BaseOutput):
     Output class for the scheduler's `step` function output.
 
     Args:
-        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
-            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
-            denoising loop.
-        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
-            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
-            `pred_original_sample` can be used to preview progress or for guidance.
+        prev_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`):
+            Computed sample `(x_{t-1})` of previous timestep with token IDs. `prev_sample` should be used as next model
+            input in the denoising loop.
+        pred_original_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`, *optional*):
+            The predicted fully denoised sample `(x_{0})` with token IDs based on the model output from the current
+            timestep. `pred_original_sample` can be used to preview progress or for guidance.
     """
 
     prev_sample: torch.Tensor
-    pred_original_sample: torch.Tensor = None
+    pred_original_sample: Optional[torch.Tensor] = None
 
 
 class AmusedScheduler(SchedulerMixin, ConfigMixin):
+    """
+    A scheduler for masked token generation as used in [`AmusedPipeline`].
+
+    This scheduler iteratively unmasks tokens based on their confidence scores, following either a cosine or linear
+    schedule. Unlike traditional diffusion schedulers that work with continuous pixel values, this scheduler operates
+    on discrete token IDs, making it suitable for autoregressive and non-autoregressive masked token generation models.
+
+    This scheduler inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the
+    generic methods the library implements for all schedulers such as loading and saving.
+
+    Args:
+        mask_token_id (`int`):
+            The token ID used to represent masked tokens in the sequence.
+        masking_schedule (`Literal["cosine", "linear"]`, *optional*, defaults to `"cosine"`):
+            The schedule type for determining the mask ratio at each timestep. Can be either `"cosine"` or `"linear"`.
+    """
+
     order = 1
 
-    temperatures: torch.Tensor
+    temperatures: Optional[torch.Tensor]
+    timesteps: Optional[torch.Tensor]
 
     @register_to_config
     def __init__(
         self,
         mask_token_id: int,
-        masking_schedule: str = "cosine",
+        masking_schedule: Literal["cosine", "linear"] = "cosine",
     ):
         self.temperatures = None
         self.timesteps = None
@@ -58,9 +111,23 @@ def __init__(
     def set_timesteps(
         self,
         num_inference_steps: int,
-        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
-        device: Union[str, torch.device] = None,
-    ):
+        temperature: Union[float, Tuple[float, float], List[float]] = (2, 0),
+        device: Optional[Union[str, torch.device]] = None,
+    ) -> None:
+        """
+        Set the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            temperature (`Union[float, Tuple[float, float], List[float]]`, *optional*, defaults to `(2, 0)`):
+                Temperature parameter(s) for controlling the randomness of sampling. If a tuple or list is provided,
+                temperatures will be linearly interpolated between the first and second values across all timesteps. If
+                a single value is provided, temperatures will be linearly interpolated from that value to 0.01.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps and temperatures should be moved to. If `None`, the timesteps are not
+                moved.
+        """
         self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)
 
         if isinstance(temperature, (tuple, list)):
@@ -71,12 +138,38 @@ def set_timesteps(
     def step(
         self,
         model_output: torch.Tensor,
-        timestep: torch.long,
+        timestep: int,
         sample: torch.LongTensor,
-        starting_mask_ratio: int = 1,
+        starting_mask_ratio: float = 1.0,
         generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
-    ) -> Union[AmusedSchedulerOutput, Tuple]:
+    ) -> Union[AmusedSchedulerOutput, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Predict the sample at the previous timestep by masking tokens based on confidence scores.
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model. Typically of shape `(batch_size, num_tokens,
+                codebook_size)` or `(batch_size, codebook_size, height, width)` for 2D inputs.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.LongTensor`):
+                A current instance of a sample created by the diffusion process. Contains token IDs, with masked
+                positions indicated by `mask_token_id`.
+            starting_mask_ratio (`float`, *optional*, defaults to 1.0):
+                A multiplier applied to the mask ratio schedule. Values less than 1.0 will result in fewer tokens being
+                masked at each step.
+            generator (`torch.Generator`, *optional*):
+                A random number generator for reproducible sampling.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return an [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or a plain tuple.
+
+        Returns:
+            [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] is returned,
+                otherwise a tuple is returned where the first element is the sample tensor (`prev_sample`) and the
+                second element is the predicted original sample tensor (`pred_original_sample`).
+        """
         two_dim_input = sample.ndim == 3 and model_output.ndim == 4
 
         if two_dim_input:
@@ -137,7 +230,27 @@ def step(
 
         return AmusedSchedulerOutput(prev_sample, pred_original_sample)
 
-    def add_noise(self, sample, timesteps, generator=None):
+    def add_noise(
+        self,
+        sample: torch.LongTensor,
+        timesteps: int,
+        generator: Optional[torch.Generator] = None,
+    ) -> torch.LongTensor:
+        """
+        Add noise to a sample by randomly masking tokens according to the masking schedule.
+
+        Args:
+            sample (`torch.LongTensor`):
+                The input sample containing token IDs to be partially masked.
+            timesteps (`int`):
+                The timestep that determines how much masking to apply. Higher timesteps result in more masking.
+            generator (`torch.Generator`, *optional*):
+                A random number generator for reproducible masking.
+
+        Returns:
+            `torch.LongTensor`:
+                The sample with some tokens replaced by `mask_token_id` according to the masking schedule.
+        """
         step_idx = (self.timesteps == timesteps).nonzero()
         ratio = (step_idx + 1) / len(self.timesteps)
 
diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py
index d7af018b284a..767fa9157f59 100644
--- a/src/diffusers/schedulers/scheduling_consistency_decoder.py
+++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py
@@ -1,6 +1,6 @@
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
 
 import torch
 
@@ -12,10 +12,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -23,16 +23,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py
index 5d81d5eb8ac0..386a43db0f9c 100644
--- a/src/diffusers/schedulers/scheduling_consistency_models.py
+++ b/src/diffusers/schedulers/scheduling_consistency_models.py
@@ -121,7 +121,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -287,7 +287,23 @@ def get_scalings_for_boundary_condition(self, sigma):
         return c_skip, c_out
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -302,7 +318,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -410,6 +433,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py
index b9567f2c47d5..103cca81c6a5 100644
--- a/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py
@@ -53,7 +53,7 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `v_prediction`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         solver_type (`str`, defaults to `midpoint`):
             Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
             sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
@@ -137,7 +137,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -266,6 +266,19 @@ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -416,7 +429,22 @@ def multistep_dpm_solver_second_order_update(
         return x_t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -439,6 +467,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
     def _init_step_index(self, timestep):
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -537,6 +569,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 5ee0d084f060..d7fe29a72ac9 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -17,7 +17,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -92,17 +93,17 @@ def alpha_bar_fn(t):
     return torch.tensor(betas, dtype=torch.float32)
 
 
-def rescale_zero_terminal_snr(betas):
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -143,9 +144,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
             The starting `beta` value of inference.
         beta_end (`float`, defaults to 0.02):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
-            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Must be one
+            of `"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
         clip_sample (`bool`, defaults to `True`):
@@ -158,10 +159,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
             otherwise it uses the alpha value at step 0.
         steps_offset (`int`, defaults to 0):
             An offset added to the inference steps, as required by some model families.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
-            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+        prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`):
+            Prediction type of the scheduler function. Must be one of `"epsilon"` (predicts the noise of the diffusion
+            process), `"sample"` (directly predicts the noisy sample), or `"v_prediction"` (see section 2.4 of [Imagen
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -169,9 +170,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
             The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
         sample_max_value (`float`, defaults to 1.0):
             The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
-        timestep_spacing (`str`, defaults to `"leading"`):
-            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
-            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        timestep_spacing (`Literal["leading", "trailing", "linspace"]`, defaults to `"leading"`):
+            The way the timesteps should be scaled. Must be one of `"leading"`, `"trailing"`, or `"linspace"`. Refer to
+            Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are
+            Flawed](https://huggingface.co/papers/2305.08891) for more information.
         rescale_betas_zero_snr (`bool`, defaults to `False`):
             Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
             dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -187,17 +189,17 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         clip_sample: bool = True,
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         clip_sample_range: float = 1.0,
         sample_max_value: float = 1.0,
-        timestep_spacing: str = "leading",
+        timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
         rescale_betas_zero_snr: bool = False,
     ):
         if trained_betas is not None:
@@ -250,7 +252,25 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None
         """
         return sample
 
-    def _get_variance(self, timestep, prev_timestep):
+    def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor:
+        """
+        Computes the variance of the noise added at a given diffusion step.
+
+        For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
+        literature:
+            var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+        where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
+
+        Args:
+            timestep (`int`):
+                The current timestep in the diffusion process.
+            prev_timestep (`int`):
+                The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
+
+        Returns:
+            `torch.Tensor`:
+                The variance for the current timestep.
+        """
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
@@ -263,6 +283,8 @@ def _get_variance(self, timestep, prev_timestep):
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -270,6 +292,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -294,13 +324,18 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
 
         return sample
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
             num_inference_steps (`int`):
                 The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`Union[str, torch.device]`, *optional*):
+                The device to use for the timesteps.
+
+        Raises:
+            ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
         """
 
         if num_inference_steps > self.config.num_train_timesteps:
@@ -346,7 +381,7 @@ def step(
         sample: torch.Tensor,
         eta: float = 0.0,
         use_clipped_model_output: bool = False,
-        generator=None,
+        generator: Optional[torch.Generator] = None,
         variance_noise: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ) -> Union[DDIMSchedulerOutput, Tuple]:
@@ -357,20 +392,21 @@ def step(
         Args:
             model_output (`torch.Tensor`):
                 The direct output from learned diffusion model.
-            timestep (`float`):
+            timestep (`int`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            eta (`float`):
-                The weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`, defaults to `False`):
+            eta (`float`, *optional*, defaults to 0.0):
+                The weight of noise for added noise in diffusion step. A value of 0 corresponds to DDIM (deterministic)
+                and 1 corresponds to DDPM (fully stochastic).
+            use_clipped_model_output (`bool`, *optional*, defaults to `False`):
                 If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
                 because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
                 clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
                 `use_clipped_model_output` has no effect.
             generator (`torch.Generator`, *optional*):
-                A random number generator.
-            variance_noise (`torch.Tensor`):
+                A random number generator for reproducible sampling.
+            variance_noise (`torch.Tensor`, *optional*):
                 Alternative to generating noise with `generator` by directly providing the noise for the variance
                 itself. Useful for methods such as [`CycleDiffusion`].
             return_dict (`bool`, *optional*, defaults to `True`):
@@ -477,6 +513,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -499,6 +551,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -517,5 +584,5 @@ def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: tor
         velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
         return velocity
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py
index c19efdc7834d..f2683d1304ec 100644
--- a/src/diffusers/schedulers/scheduling_ddim_cogvideox.py
+++ b/src/diffusers/schedulers/scheduling_ddim_cogvideox.py
@@ -18,7 +18,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -156,7 +157,7 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -408,6 +409,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -430,6 +447,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py
index 49dba840d089..8ae13ad49d10 100644
--- a/src/diffusers/schedulers/scheduling_ddim_inverse.py
+++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py
@@ -16,7 +16,7 @@
 # and https://github.com/hojonathanho/diffusion
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -47,10 +47,10 @@ class DDIMSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -58,16 +58,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -95,13 +96,13 @@ def rescale_zero_terminal_snr(betas):
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -159,7 +160,7 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         timestep_spacing (`str`, defaults to `"leading"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py
index 7c3f03a8dbe1..10873a082fee 100644
--- a/src/diffusers/schedulers/scheduling_ddim_parallel.py
+++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py
@@ -17,7 +17,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -49,10 +49,10 @@ class DDIMParallelSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -97,13 +98,13 @@ def rescale_zero_terminal_snr(betas):
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -163,7 +164,7 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
-            https://imagen.research.google/video/paper.pdf)
+            https://huggingface.co/papers/2210.02303)
         thresholding (`bool`, default `False`):
             whether to use the "dynamic thresholding" method (introduced by Imagen,
             https://huggingface.co/papers/2205.11487). Note that the thresholding method is unsuitable for latent-space
@@ -194,17 +195,17 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         clip_sample: bool = True,
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         clip_sample_range: float = 1.0,
         sample_max_value: float = 1.0,
-        timestep_spacing: str = "leading",
+        timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
         rescale_betas_zero_snr: bool = False,
     ):
         if trained_betas is not None:
@@ -285,6 +286,8 @@ def _batch_get_variance(self, t, prev_t):
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -292,6 +295,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -324,6 +335,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         Args:
             num_inference_steps (`int`):
                 The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`Union[str, torch.device]`, *optional*):
+                The device to use for the timesteps.
+
+        Raises:
+            ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
         """
 
         if num_inference_steps > self.config.num_train_timesteps:
@@ -602,6 +618,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -624,6 +656,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 0fab6d910a82..ded88b8e1e0a 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -16,7 +16,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -46,10 +46,10 @@ class DDPMSchedulerOutput(BaseOutput):
 
 
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -57,16 +57,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -90,17 +91,17 @@ def alpha_bar_fn(t):
 
 
 # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
-def rescale_zero_terminal_snr(betas):
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -134,39 +135,37 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
     methods the library implements for all schedulers such as loading and saving.
 
     Args:
-        num_train_timesteps (`int`, defaults to 1000):
+        num_train_timesteps (`int`, defaults to `1000`):
             The number of diffusion steps to train the model.
-        beta_start (`float`, defaults to 0.0001):
+        beta_start (`float`, defaults to `0.0001`):
             The starting `beta` value of inference.
-        beta_end (`float`, defaults to 0.02):
+        beta_end (`float`, defaults to `0.02`):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
-            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`.
+        beta_schedule (`"linear"`, `"scaled_linear"`, `"squaredcos_cap_v2"`, or `"sigmoid"`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
         trained_betas (`np.ndarray`, *optional*):
             An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
-        variance_type (`str`, defaults to `"fixed_small"`):
-            Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
-            `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+        variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, defaults to `"fixed_small"`):
+            Clip the variance when adding noise to the denoised sample.
         clip_sample (`bool`, defaults to `True`):
             Clip the predicted sample for numerical stability.
-        clip_sample_range (`float`, defaults to 1.0):
+        clip_sample_range (`float`, defaults to `1.0`):
             The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
-        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+        dynamic_thresholding_ratio (`float`, defaults to `0.995`):
             The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
-        sample_max_value (`float`, defaults to 1.0):
+        sample_max_value (`float`, defaults to `1.0`):
             The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
-        timestep_spacing (`str`, defaults to `"leading"`):
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"leading"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
-        steps_offset (`int`, defaults to 0):
+        steps_offset (`int`, defaults to `0`):
             An offset added to the inference steps, as required by some model families.
         rescale_betas_zero_snr (`bool`, defaults to `False`):
             Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
@@ -183,16 +182,18 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        variance_type: str = "fixed_small",
+        variance_type: Literal[
+            "fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
+        ] = "fixed_small",
         clip_sample: bool = True,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         clip_sample_range: float = 1.0,
         sample_max_value: float = 1.0,
-        timestep_spacing: str = "leading",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
         steps_offset: int = 0,
         rescale_betas_zero_snr: bool = False,
     ):
@@ -322,7 +323,31 @@ def set_timesteps(
 
         self.timesteps = torch.from_numpy(timesteps).to(device)
 
-    def _get_variance(self, t, predicted_variance=None, variance_type=None):
+    def _get_variance(
+        self,
+        t: int,
+        predicted_variance: Optional[torch.Tensor] = None,
+        variance_type: Optional[
+            Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
+        ] = None,
+    ) -> torch.Tensor:
+        """
+        Compute the variance for a given timestep according to the specified variance type.
+
+        Args:
+            t (`int`):
+                The current timestep.
+            predicted_variance (`torch.Tensor`, *optional*):
+                The predicted variance from the model. Used only when `variance_type` is `"learned"` or
+                `"learned_range"`.
+            variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
+                The type of variance to compute. If `None`, uses the variance type specified in the scheduler
+                configuration.
+
+        Returns:
+            `torch.Tensor`:
+                The computed variance.
+        """
         prev_t = self.previous_timestep(t)
 
         alpha_prod_t = self.alphas_cumprod[t]
@@ -364,6 +389,8 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None):
 
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -371,6 +398,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -400,7 +435,7 @@ def step(
         model_output: torch.Tensor,
         timestep: int,
         sample: torch.Tensor,
-        generator=None,
+        generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
         """
@@ -410,20 +445,19 @@ def step(
         Args:
             model_output (`torch.Tensor`):
                 The direct output from learned diffusion model.
-            timestep (`float`):
+            timestep (`int`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
             generator (`torch.Generator`, *optional*):
                 A random number generator.
-            return_dict (`bool`, *optional*, defaults to `True`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
 
         Returns:
             [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
                 If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
                 tuple is returned where the first element is the sample tensor.
-
         """
         t = timestep
 
@@ -504,6 +538,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -525,6 +575,21 @@ def add_noise(
         return noisy_samples
 
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -543,10 +608,21 @@ def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: tor
         velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
         return velocity
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
 
-    def previous_timestep(self, timestep):
+    def previous_timestep(self, timestep: int) -> int:
+        """
+        Compute the previous timestep in the diffusion chain.
+
+        Args:
+            timestep (`int`):
+                The current timestep.
+
+        Returns:
+            `int`:
+                The previous timestep.
+        """
         if self.custom_timesteps or self.num_inference_steps:
             index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
             if index == self.timesteps.shape[0] - 1:
diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py
index ec741f9ecb7d..941fc16be080 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py
@@ -16,7 +16,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -48,10 +48,10 @@ class DDPMParallelSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -59,16 +59,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -96,13 +97,13 @@ def rescale_zero_terminal_snr(betas):
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -159,7 +160,7 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
-            https://imagen.research.google/video/paper.pdf)
+            https://huggingface.co/papers/2210.02303)
         thresholding (`bool`, default `False`):
             whether to use the "dynamic thresholding" method (introduced by Imagen,
             https://huggingface.co/papers/2205.11487). Note that the thresholding method is unsuitable for latent-space
@@ -191,16 +192,18 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        variance_type: str = "fixed_small",
+        variance_type: Literal[
+            "fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
+        ] = "fixed_small",
         clip_sample: bool = True,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         clip_sample_range: float = 1.0,
         sample_max_value: float = 1.0,
-        timestep_spacing: str = "leading",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
         steps_offset: int = 0,
         rescale_betas_zero_snr: bool = False,
     ):
@@ -333,7 +336,31 @@ def set_timesteps(
         self.timesteps = torch.from_numpy(timesteps).to(device)
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance
-    def _get_variance(self, t, predicted_variance=None, variance_type=None):
+    def _get_variance(
+        self,
+        t: int,
+        predicted_variance: Optional[torch.Tensor] = None,
+        variance_type: Optional[
+            Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
+        ] = None,
+    ) -> torch.Tensor:
+        """
+        Compute the variance for a given timestep according to the specified variance type.
+
+        Args:
+            t (`int`):
+                The current timestep.
+            predicted_variance (`torch.Tensor`, *optional*):
+                The predicted variance from the model. Used only when `variance_type` is `"learned"` or
+                `"learned_range"`.
+            variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
+                The type of variance to compute. If `None`, uses the variance type specified in the scheduler
+                configuration.
+
+        Returns:
+            `torch.Tensor`:
+                The computed variance.
+        """
         prev_t = self.previous_timestep(t)
 
         alpha_prod_t = self.alphas_cumprod[t]
@@ -376,6 +403,8 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None):
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -383,6 +412,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -593,6 +630,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -615,6 +668,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -638,6 +706,17 @@ def __len__(self):
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
     def previous_timestep(self, timestep):
+        """
+        Compute the previous timestep in the diffusion chain.
+
+        Args:
+            timestep (`int`):
+                The current timestep.
+
+        Returns:
+            `int`:
+                The previous timestep.
+        """
         if self.custom_timesteps or self.num_inference_steps:
             index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
             if index == self.timesteps.shape[0] - 1:
diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py
index 7d8685ba10c3..b7d64fc00bae 100644
--- a/src/diffusers/schedulers/scheduling_deis_multistep.py
+++ b/src/diffusers/schedulers/scheduling_deis_multistep.py
@@ -16,7 +16,7 @@
 # The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -32,10 +32,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -83,33 +84,35 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
     methods the library implements for all schedulers such as loading and saving.
 
     Args:
-        num_train_timesteps (`int`, defaults to 1000):
+        num_train_timesteps (`int`, defaults to `1000`):
             The number of diffusion steps to train the model.
-        beta_start (`float`, defaults to 0.0001):
+        beta_start (`float`, defaults to `0.0001`):
             The starting `beta` value of inference.
-        beta_end (`float`, defaults to 0.02):
+        beta_end (`float`, defaults to `0.02`):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
             The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
             `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-        trained_betas (`np.ndarray`, *optional*):
+        trained_betas (`np.ndarray` or `List[float]`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
-        solver_order (`int`, defaults to 2):
+        solver_order (`int`, defaults to `2`):
             The DEIS order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
-        prediction_type (`str`, defaults to `epsilon`):
+        prediction_type (`"epsilon"`, `"sample"`, `"v_prediction"`, or `"flow_prediction"`, defaults to `"epsilon"`):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            `sample` (directly predicts the noisy sample`), `v_prediction` (see section 2.4 of [Imagen
+            Video](https://huggingface.co/papers/2210.02303) paper), or `flow_prediction`.
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
-        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+        dynamic_thresholding_ratio (`float`, defaults to `0.995`):
             The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
-        sample_max_value (`float`, defaults to 1.0):
+        sample_max_value (`float`, defaults to `1.0`):
             The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
-        algorithm_type (`str`, defaults to `deis`):
+        algorithm_type (`"deis"`, defaults to `"deis"`):
             The algorithm type for the solver.
+        solver_type (`"logrho"`, defaults to `"logrho"`):
+            Solver type for DEIS.
         lower_order_final (`bool`, defaults to `True`):
             Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps.
         use_karras_sigmas (`bool`, *optional*, defaults to `False`):
@@ -120,11 +123,19 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
         use_beta_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
             Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
-        timestep_spacing (`str`, defaults to `"linspace"`):
+        use_flow_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use flow sigmas for step sizes in the noise schedule during the sampling process.
+        flow_shift (`float`, *optional*, defaults to `1.0`):
+            The flow shift parameter for flow-based models.
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
-        steps_offset (`int`, defaults to 0):
+        steps_offset (`int`, defaults to `0`):
             An offset added to the inference steps, as required by some model families.
+        use_dynamic_shifting (`bool`, defaults to `False`):
+            Whether to use dynamic shifting for the noise schedule.
+        time_shift_type (`"exponential"`, defaults to `"exponential"`):
+            The type of time shifting to apply.
     """
 
     _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -136,29 +147,38 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
-        trained_betas: Optional[np.ndarray] = None,
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         solver_order: int = 2,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction", "flow_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         sample_max_value: float = 1.0,
-        algorithm_type: str = "deis",
-        solver_type: str = "logrho",
+        algorithm_type: Literal["deis"] = "deis",
+        solver_type: Literal["logrho"] = "logrho",
         lower_order_final: bool = True,
         use_karras_sigmas: Optional[bool] = False,
         use_exponential_sigmas: Optional[bool] = False,
         use_beta_sigmas: Optional[bool] = False,
         use_flow_sigmas: Optional[bool] = False,
         flow_shift: Optional[float] = 1.0,
-        timestep_spacing: str = "linspace",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
         steps_offset: int = 0,
         use_dynamic_shifting: bool = False,
-        time_shift_type: str = "exponential",
-    ):
+        time_shift_type: Literal["exponential"] = "exponential",
+    ) -> None:
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
-        if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
+        if (
+            sum(
+                [
+                    self.config.use_beta_sigmas,
+                    self.config.use_exponential_sigmas,
+                    self.config.use_karras_sigmas,
+                ]
+            )
+            > 1
+        ):
             raise ValueError(
                 "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
             )
@@ -168,7 +188,15 @@ def __init__(
             self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
         elif beta_schedule == "scaled_linear":
             # this schedule is very specific to the latent diffusion model.
-            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            self.betas = (
+                torch.linspace(
+                    beta_start**0.5,
+                    beta_end**0.5,
+                    num_train_timesteps,
+                    dtype=torch.float32,
+                )
+                ** 2
+            )
         elif beta_schedule == "squaredcos_cap_v2":
             # Glide cosine schedule
             self.betas = betas_for_alpha_bar(num_train_timesteps)
@@ -210,33 +238,36 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     @property
-    def step_index(self):
+    def step_index(self) -> Optional[int]:
         """
         The index counter for current timestep. It will increase 1 after each scheduler step.
         """
         return self._step_index
 
     @property
-    def begin_index(self):
+    def begin_index(self) -> Optional[int]:
         """
         The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
         """
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
 
     def set_timesteps(
-        self, num_inference_steps: int, device: Union[str, torch.device] = None, mu: Optional[float] = None
-    ):
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        mu: Optional[float] = None,
+    ) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -245,6 +276,9 @@ def set_timesteps(
                 The number of diffusion steps used when generating samples with a pre-trained model.
             device (`str` or `torch.device`, *optional*):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            mu (`float`, *optional*):
+                The mu parameter for dynamic shifting. Only used when `use_dynamic_shifting=True` and
+                `time_shift_type="exponential"`.
         """
         if mu is not None:
             assert self.config.use_dynamic_shifting and self.config.time_shift_type == "exponential"
@@ -320,6 +354,8 @@ def set_timesteps(
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -327,6 +363,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -352,7 +396,20 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         return sample
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -376,7 +433,18 @@ def _sigma_to_t(self, sigma, log_sigmas):
         return t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
-    def _sigma_to_alpha_sigma_t(self, sigma):
+    def _sigma_to_alpha_sigma_t(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Convert sigma values to alpha_t and sigma_t values.
+
+        Args:
+            sigma (`torch.Tensor`):
+                The sigma value(s) to convert.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing (alpha_t, sigma_t) values.
+        """
         if self.config.use_flow_sigmas:
             alpha_t = 1 - sigma
             sigma_t = sigma
@@ -387,8 +455,21 @@ def _sigma_to_alpha_sigma_t(self, sigma):
         return alpha_t, sigma_t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -414,7 +495,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -438,7 +531,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -571,7 +681,10 @@ def deis_first_order_update(
                 "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
             )
 
-        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
+        sigma_t, sigma_s = (
+            self.sigmas[self.step_index + 1],
+            self.sigmas[self.step_index],
+        )
         alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
         alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
         lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
@@ -637,7 +750,11 @@ def multistep_deis_second_order_update(
 
         m0, m1 = model_output_list[-1], model_output_list[-2]
 
-        rho_t, rho_s0, rho_s1 = sigma_t / alpha_t, sigma_s0 / alpha_s0, sigma_s1 / alpha_s1
+        rho_t, rho_s0, rho_s1 = (
+            sigma_t / alpha_t,
+            sigma_s0 / alpha_s0,
+            sigma_s1 / alpha_s1,
+        )
 
         if self.config.algorithm_type == "deis":
 
@@ -742,7 +859,22 @@ def ind_fn(t, b, c, d):
             raise NotImplementedError("only support log-rho multistep deis now")
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -762,9 +894,13 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return step_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[int, torch.Tensor]) -> None:
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -788,18 +924,17 @@ def step(
         Args:
             model_output (`torch.Tensor`):
                 The direct output from learned diffusion model.
-            timestep (`int`):
+            timestep (`int` or `torch.Tensor`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
 
         Returns:
             [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                 If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                 tuple is returned where the first element is the sample tensor.
-
         """
         if self.num_inference_steps is None:
             raise ValueError(
@@ -861,6 +996,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples without noise.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps at which to add noise to the samples.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -889,5 +1039,5 @@ def add_noise(
         noisy_samples = alpha_t * original_samples + sigma_t * noise
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py
index f7b63720e107..0a9082208cf4 100644
--- a/src/diffusers/schedulers/scheduling_dpm_cogvideox.py
+++ b/src/diffusers/schedulers/scheduling_dpm_cogvideox.py
@@ -18,7 +18,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -50,10 +50,10 @@ class DDIMSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -157,7 +158,7 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -445,6 +446,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -467,6 +484,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index 8b523cd13f1f..e7ba0ba1f30e 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -15,7 +15,7 @@
 # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -32,10 +32,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -80,13 +81,13 @@ def rescale_zero_terminal_snr(betas):
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -126,18 +127,17 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
             The starting `beta` value of inference.
         beta_end (`float`, defaults to 0.02):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
-            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
         solver_order (`int`, defaults to 2):
             The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
-            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample), `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper), or `flow_prediction`.
+        prediction_type (`"epsilon"`, `"sample"`, `"v_prediction"`, or `"flow_prediction"`, defaults to `"epsilon"`):
+            Prediction type of the scheduler function. `epsilon` predicts the noise of the diffusion process, `sample`
+            directly predicts the noisy sample, `v_prediction` predicts the velocity (see section 2.4 of [Imagen
+            Video](https://huggingface.co/papers/2210.02303) paper), and `flow_prediction` predicts the flow.
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -146,15 +146,14 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         sample_max_value (`float`, defaults to 1.0):
             The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
             `algorithm_type="dpmsolver++"`.
-        algorithm_type (`str`, defaults to `dpmsolver++`):
-            Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
-            `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
-            paper, and the `dpmsolver++` type implements the algorithms in the
-            [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
-            `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
-        solver_type (`str`, defaults to `midpoint`):
-            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
-            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
+        algorithm_type (`"dpmsolver"`, `"dpmsolver++"`, `"sde-dpmsolver"`, or `"sde-dpmsolver++"`, defaults to `"dpmsolver++"`):
+            Algorithm type for the solver. The `dpmsolver` type implements the algorithms in the
+            [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the `dpmsolver++` type implements the
+            algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use
+            `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        solver_type (`"midpoint"` or `"heun"`, defaults to `"midpoint"`):
+            Solver type for the second-order solver. The solver type slightly affects the sample quality, especially
+            for a small number of steps. It is recommended to use `midpoint` solvers.
         lower_order_final (`bool`, defaults to `True`):
             Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
             stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
@@ -178,16 +177,16 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
             Whether to use flow sigmas for step sizes in the noise schedule during the sampling process.
         flow_shift (`float`, *optional*, defaults to 1.0):
             The shift value for the timestep schedule for flow matching.
-        final_sigmas_type (`str`, defaults to `"zero"`):
+        final_sigmas_type (`"zero"` or `"sigma_min"`, *optional*, defaults to `"zero"`):
             The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
-            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+            sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
         lambda_min_clipped (`float`, defaults to `-inf`):
             Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
             cosine (`squaredcos_cap_v2`) noise schedule.
-        variance_type (`str`, *optional*):
-            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
-            contains the predicted Gaussian variance.
-        timestep_spacing (`str`, defaults to `"linspace"`):
+        variance_type (`"learned"` or `"learned_range"`, *optional*):
+            Set to `"learned"` or `"learned_range"` for diffusion models that predict variance. If set, the model's
+            output contains the predicted Gaussian variance.
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
         steps_offset (`int`, defaults to 0):
@@ -196,6 +195,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
             Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
             dark samples instead of limiting it to samples with medium brightness. Loosely related to
             [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
+        use_dynamic_shifting (`bool`, defaults to `False`):
+            Whether to use dynamic shifting for the timestep schedule.
+        time_shift_type (`"exponential"`, defaults to `"exponential"`):
+            The type of time shift to apply when using dynamic shifting.
     """
 
     _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -207,15 +210,15 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         solver_order: int = 2,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction", "flow_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         sample_max_value: float = 1.0,
-        algorithm_type: str = "dpmsolver++",
-        solver_type: str = "midpoint",
+        algorithm_type: Literal["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"] = "dpmsolver++",
+        solver_type: Literal["midpoint", "heun"] = "midpoint",
         lower_order_final: bool = True,
         euler_at_final: bool = False,
         use_karras_sigmas: Optional[bool] = False,
@@ -224,14 +227,14 @@ def __init__(
         use_lu_lambdas: Optional[bool] = False,
         use_flow_sigmas: Optional[bool] = False,
         flow_shift: Optional[float] = 1.0,
-        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        final_sigmas_type: Optional[Literal["zero", "sigma_min"]] = "zero",
         lambda_min_clipped: float = -float("inf"),
-        variance_type: Optional[str] = None,
-        timestep_spacing: str = "linspace",
+        variance_type: Optional[Literal["learned", "learned_range"]] = None,
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
         steps_offset: int = 0,
         rescale_betas_zero_snr: bool = False,
         use_dynamic_shifting: bool = False,
-        time_shift_type: str = "exponential",
+        time_shift_type: Literal["exponential"] = "exponential",
     ):
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -323,26 +326,29 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
 
     def set_timesteps(
         self,
-        num_inference_steps: int = None,
-        device: Union[str, torch.device] = None,
+        num_inference_steps: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = None,
         mu: Optional[float] = None,
         timesteps: Optional[List[int]] = None,
-    ):
+    ) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
-            num_inference_steps (`int`):
+            num_inference_steps (`int`, *optional*):
                 The number of diffusion steps used when generating samples with a pre-trained model.
             device (`str` or `torch.device`, *optional*):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            mu (`float`, *optional*):
+                The mu parameter for dynamic shifting. If provided, requires `use_dynamic_shifting=True` and
+                `time_shift_type="exponential"`.
             timesteps (`List[int]`, *optional*):
                 Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated
                 based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
@@ -460,6 +466,8 @@ def set_timesteps(
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -467,6 +475,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -492,7 +508,20 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         return sample
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -515,7 +544,18 @@ def _sigma_to_t(self, sigma, log_sigmas):
         t = t.reshape(sigma.shape)
         return t
 
-    def _sigma_to_alpha_sigma_t(self, sigma):
+    def _sigma_to_alpha_sigma_t(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Convert sigma values to alpha_t and sigma_t values.
+
+        Args:
+            sigma (`torch.Tensor`):
+                The sigma value(s) to convert.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing (alpha_t, sigma_t) values.
+        """
         if self.config.use_flow_sigmas:
             alpha_t = 1 - sigma
             sigma_t = sigma
@@ -527,7 +567,20 @@ def _sigma_to_alpha_sigma_t(self, sigma):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -551,8 +604,21 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
         sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
         return sigmas
 
-    def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Lu et al. (2022)."""
+    def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """
+        Construct the noise schedule as proposed in [DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model
+        Sampling in Around 10 Steps](https://huggingface.co/papers/2206.00927) by Lu et al. (2022).
+
+        Args:
+            in_lambdas (`torch.Tensor`):
+                The input lambda values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted lambda values following the Lu noise schedule.
+        """
 
         lambda_min: float = in_lambdas[-1].item()
         lambda_max: float = in_lambdas[0].item()
@@ -566,7 +632,19 @@ def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -590,7 +668,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -1003,7 +1098,22 @@ def multistep_dpm_solver_third_order_update(
             )
         return x_t
 
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -1022,9 +1132,13 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
 
         return step_index
 
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[int, torch.Tensor]) -> None:
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -1039,7 +1153,7 @@ def step(
         model_output: torch.Tensor,
         timestep: Union[int, torch.Tensor],
         sample: torch.Tensor,
-        generator=None,
+        generator: Optional[torch.Generator] = None,
         variance_noise: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ) -> Union[SchedulerOutput, Tuple]:
@@ -1049,22 +1163,22 @@ def step(
 
         Args:
             model_output (`torch.Tensor`):
-                The direct output from learned diffusion model.
-            timestep (`int`):
+                The direct output from the learned diffusion model.
+            timestep (`int` or `torch.Tensor`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
             generator (`torch.Generator`, *optional*):
                 A random number generator.
-            variance_noise (`torch.Tensor`):
+            variance_noise (`torch.Tensor`, *optional*):
                 Alternative to generating noise with `generator` by directly providing the noise for the variance
                 itself. Useful for methods such as [`LEdits++`].
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
 
         Returns:
             [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
-                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                If `return_dict` is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                 tuple is returned where the first element is the sample tensor.
 
         """
@@ -1144,6 +1258,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples without noise.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps at which to add noise to the samples.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
index f1a1ac3d8216..6696b0375f9f 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py
@@ -15,7 +15,7 @@
 # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -32,10 +32,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -100,7 +101,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -332,6 +333,8 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -339,6 +342,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -365,6 +376,19 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -389,6 +413,17 @@ def _sigma_to_t(self, sigma, log_sigmas):
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
     def _sigma_to_alpha_sigma_t(self, sigma):
+        """
+        Convert sigma values to alpha_t and sigma_t values.
+
+        Args:
+            sigma (`torch.Tensor`):
+                The sigma value(s) to convert.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing (alpha_t, sigma_t) values.
+        """
         if self.config.use_flow_sigmas:
             alpha_t = 1 - sigma
             sigma_t = sigma
@@ -400,7 +435,20 @@ def _sigma_to_alpha_sigma_t(self, sigma):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -426,7 +474,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -450,7 +510,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
index eeb06773d977..81c9e4134f57 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -115,10 +115,10 @@ def __call__(self, sigma, sigma_next):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -126,16 +126,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -181,7 +182,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         use_karras_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
             the sigmas are determined according to a sequence of noise levels {σi}.
@@ -250,7 +251,23 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -265,7 +282,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -301,7 +325,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -429,6 +453,19 @@ def t_fn(_sigma):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -451,9 +488,20 @@ def _sigma_to_t(self, sigma, log_sigmas):
         t = t.reshape(sigma.shape)
         return t
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
+    # Copied from diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         sigma_min: float = in_sigmas[-1].item()
         sigma_max: float = in_sigmas[0].item()
@@ -467,7 +515,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -491,7 +551,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -645,6 +722,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 1ae824973034..4916e1abb549 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -15,7 +15,7 @@
 # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -34,10 +34,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -45,16 +45,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -85,42 +86,42 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
     methods the library implements for all schedulers such as loading and saving.
 
     Args:
-        num_train_timesteps (`int`, defaults to 1000):
+        num_train_timesteps (`int`, defaults to `1000`):
             The number of diffusion steps to train the model.
-        beta_start (`float`, defaults to 0.0001):
+        beta_start (`float`, defaults to `0.0001`):
             The starting `beta` value of inference.
-        beta_end (`float`, defaults to 0.02):
+        beta_end (`float`, defaults to `0.02`):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
             The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
             `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-        trained_betas (`np.ndarray`, *optional*):
+        trained_betas (`np.ndarray` or `List[float]`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
-        solver_order (`int`, defaults to 2):
+        solver_order (`int`, defaults to `2`):
             The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"`, `"sample"`, `"v_prediction"`, or `"flow_prediction"`, defaults to `"epsilon"`):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            `sample` (directly predicts the noisy sample`), `v_prediction` (see section 2.4 of [Imagen
+            Video](https://huggingface.co/papers/2210.02303) paper), or `flow_prediction`.
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
-        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+        dynamic_thresholding_ratio (`float`, defaults to `0.995`):
             The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
-        sample_max_value (`float`, defaults to 1.0):
+        sample_max_value (`float`, defaults to `1.0`):
             The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
             `algorithm_type="dpmsolver++"`.
-        algorithm_type (`str`, defaults to `dpmsolver++`):
-            Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver`
+        algorithm_type (`"dpmsolver"`, `"dpmsolver++"`, or `"sde-dpmsolver++"`, defaults to `"dpmsolver++"`):
+            Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, or `sde-dpmsolver++`. The `dpmsolver`
             type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the
             `dpmsolver++` type implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095)
             paper. It is recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided
             sampling like in Stable Diffusion.
-        solver_type (`str`, defaults to `midpoint`):
+        solver_type (`"midpoint"` or `"heun"`, defaults to `"midpoint"`):
             Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
             sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
-        lower_order_final (`bool`, defaults to `True`):
+        lower_order_final (`bool`, defaults to `False`):
             Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
             stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
         use_karras_sigmas (`bool`, *optional*, defaults to `False`):
@@ -131,15 +132,23 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
         use_beta_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
             Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
-        final_sigmas_type (`str`, *optional*, defaults to `"zero"`):
+        use_flow_sigmas (`bool`, *optional*, defaults to `False`):
+            Whether to use flow sigmas for step sizes in the noise schedule during the sampling process.
+        flow_shift (`float`, *optional*, defaults to `1.0`):
+            The flow shift parameter for flow-based models.
+        final_sigmas_type (`"zero"` or `"sigma_min"`, *optional*, defaults to `"zero"`):
             The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
-            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+            sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
         lambda_min_clipped (`float`, defaults to `-inf`):
             Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
             cosine (`squaredcos_cap_v2`) noise schedule.
-        variance_type (`str`, *optional*):
-            Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
-            contains the predicted Gaussian variance.
+        variance_type (`"learned"` or `"learned_range"`, *optional*):
+            Set to `"learned"` or `"learned_range"` for diffusion models that predict variance. If set, the model's
+            output contains the predicted Gaussian variance.
+        use_dynamic_shifting (`bool`, defaults to `False`):
+            Whether to use dynamic shifting for the noise schedule.
+        time_shift_type (`"exponential"`, defaults to `"exponential"`):
+            The type of time shifting to apply.
     """
 
     _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -151,27 +160,27 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
-        trained_betas: Optional[np.ndarray] = None,
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
+        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         solver_order: int = 2,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction", "flow_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         sample_max_value: float = 1.0,
-        algorithm_type: str = "dpmsolver++",
-        solver_type: str = "midpoint",
+        algorithm_type: Literal["dpmsolver", "dpmsolver++", "sde-dpmsolver++"] = "dpmsolver++",
+        solver_type: Literal["midpoint", "heun"] = "midpoint",
         lower_order_final: bool = False,
         use_karras_sigmas: Optional[bool] = False,
         use_exponential_sigmas: Optional[bool] = False,
         use_beta_sigmas: Optional[bool] = False,
         use_flow_sigmas: Optional[bool] = False,
         flow_shift: Optional[float] = 1.0,
-        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        final_sigmas_type: Optional[Literal["zero", "sigma_min"]] = "zero",
         lambda_min_clipped: float = -float("inf"),
-        variance_type: Optional[str] = None,
+        variance_type: Optional[Literal["learned", "learned_range"]] = None,
         use_dynamic_shifting: bool = False,
-        time_shift_type: str = "exponential",
-    ):
+        time_shift_type: Literal["exponential"] = "exponential",
+    ) -> None:
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
         if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
@@ -241,6 +250,10 @@ def get_order_list(self, num_inference_steps: int) -> List[int]:
         Args:
             num_inference_steps (`int`):
                 The number of diffusion steps used when generating samples with a pre-trained model.
+
+        Returns:
+            `List[int]`:
+                The list of solver orders for each timestep.
         """
         steps = num_inference_steps
         order = self.config.solver_order
@@ -275,45 +288,55 @@ def get_order_list(self, num_inference_steps: int) -> List[int]:
         return orders
 
     @property
-    def step_index(self):
+    def step_index(self) -> Optional[int]:
         """
         The index counter for current timestep. It will increase 1 after each scheduler step.
+
+        Returns:
+            `int` or `None`:
+                The current step index.
         """
         return self._step_index
 
     @property
-    def begin_index(self):
+    def begin_index(self) -> Optional[int]:
         """
         The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+
+        Returns:
+            `int` or `None`:
+                The begin index.
         """
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
 
     def set_timesteps(
         self,
-        num_inference_steps: int = None,
-        device: Union[str, torch.device] = None,
+        num_inference_steps: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = None,
         mu: Optional[float] = None,
         timesteps: Optional[List[int]] = None,
-    ):
+    ) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
-            num_inference_steps (`int`):
+            num_inference_steps (`int`, *optional*):
                 The number of diffusion steps used when generating samples with a pre-trained model.
             device (`str` or `torch.device`, *optional*):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            mu (`float`, *optional*):
+                The mu parameter for dynamic shifting.
             timesteps (`List[int]`, *optional*):
                 Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
                 timestep spacing strategy of equal spacing between timesteps schedule is used. If `timesteps` is
@@ -410,6 +433,8 @@ def set_timesteps(
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -417,6 +442,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -442,7 +475,20 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         return sample
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -466,7 +512,18 @@ def _sigma_to_t(self, sigma, log_sigmas):
         return t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
-    def _sigma_to_alpha_sigma_t(self, sigma):
+    def _sigma_to_alpha_sigma_t(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Convert sigma values to alpha_t and sigma_t values.
+
+        Args:
+            sigma (`torch.Tensor`):
+                The sigma value(s) to convert.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing (alpha_t, sigma_t) values.
+        """
         if self.config.use_flow_sigmas:
             alpha_t = 1 - sigma
             sigma_t = sigma
@@ -477,8 +534,21 @@ def _sigma_to_alpha_sigma_t(self, sigma):
         return alpha_t, sigma_t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -504,7 +574,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -528,7 +610,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -560,7 +659,7 @@ def convert_model_output(
         self,
         model_output: torch.Tensor,
         *args,
-        sample: torch.Tensor = None,
+        sample: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -656,7 +755,7 @@ def dpm_solver_first_order_update(
         self,
         model_output: torch.Tensor,
         *args,
-        sample: torch.Tensor = None,
+        sample: Optional[torch.Tensor] = None,
         noise: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
@@ -720,7 +819,7 @@ def singlestep_dpm_solver_second_order_update(
         self,
         model_output_list: List[torch.Tensor],
         *args,
-        sample: torch.Tensor = None,
+        sample: Optional[torch.Tensor] = None,
         noise: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
@@ -831,7 +930,7 @@ def singlestep_dpm_solver_third_order_update(
         self,
         model_output_list: List[torch.Tensor],
         *args,
-        sample: torch.Tensor = None,
+        sample: Optional[torch.Tensor] = None,
         noise: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
@@ -953,8 +1052,8 @@ def singlestep_dpm_solver_update(
         self,
         model_output_list: List[torch.Tensor],
         *args,
-        sample: torch.Tensor = None,
-        order: int = None,
+        sample: Optional[torch.Tensor] = None,
+        order: Optional[int] = None,
         noise: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
@@ -1013,7 +1112,22 @@ def singlestep_dpm_solver_update(
             raise ValueError(f"Order must be 1, 2, 3, got {order}")
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -1033,9 +1147,13 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return step_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[int, torch.Tensor]) -> None:
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -1050,7 +1168,7 @@ def step(
         model_output: torch.Tensor,
         timestep: Union[int, torch.Tensor],
         sample: torch.Tensor,
-        generator=None,
+        generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
     ) -> Union[SchedulerOutput, Tuple]:
         """
@@ -1060,11 +1178,13 @@ def step(
         Args:
             model_output (`torch.Tensor`):
                 The direct output from learned diffusion model.
-            timestep (`int`):
+            timestep (`int` or `torch.Tensor`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
+            generator (`torch.Generator`, *optional*):
+                A random number generator for stochastic sampling.
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
 
         Returns:
@@ -1138,6 +1258,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples without noise.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps at which to add noise to the samples.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -1166,5 +1301,5 @@ def add_noise(
         noisy_samples = alpha_t * original_samples + sigma_t * noise
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
index e9ba695e1f39..d4e8ca5e8b18 100644
--- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py
@@ -57,7 +57,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -169,7 +169,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -299,6 +299,8 @@ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> t
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -306,6 +308,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -332,6 +342,19 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -555,7 +578,22 @@ def multistep_dpm_solver_third_order_update(
         return x_t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -578,6 +616,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
     def _init_step_index(self, timestep):
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -672,6 +714,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py
index dbeff3de5652..2ed05d396514 100644
--- a/src/diffusers/schedulers/scheduling_edm_euler.py
+++ b/src/diffusers/schedulers/scheduling_edm_euler.py
@@ -74,7 +74,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         rho (`float`, *optional*, defaults to 7.0):
             The rho parameter used for calculating the Karras sigma schedule, which is set to 7.0 in the EDM paper [1].
         final_sigmas_type (`str`, defaults to `"zero"`):
@@ -155,7 +155,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -284,7 +284,23 @@ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> t
         return sigmas
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -299,7 +315,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -413,6 +436,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index 9cdaa2c5e101..97fd84db5621 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -49,10 +49,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -93,17 +94,17 @@ def alpha_bar_fn(t):
 
 
 # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
-def rescale_zero_terminal_snr(betas):
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -143,16 +144,16 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
             The starting `beta` value of inference.
         beta_end (`float`, defaults to 0.02):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
             The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear` or `scaled_linear`.
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
-        timestep_spacing (`str`, defaults to `"linspace"`):
+            Video](https://huggingface.co/papers/2210.02303) paper).
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
         steps_offset (`int`, defaults to 0):
@@ -172,13 +173,13 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        prediction_type: str = "epsilon",
-        timestep_spacing: str = "linspace",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
         steps_offset: int = 0,
         rescale_betas_zero_snr: bool = False,
-    ):
+    ) -> None:
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
         elif beta_schedule == "linear":
@@ -218,7 +219,7 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     @property
-    def init_noise_sigma(self):
+    def init_noise_sigma(self) -> torch.Tensor:
         # standard deviation of the initial noise distribution
         if self.config.timestep_spacing in ["linspace", "trailing"]:
             return self.sigmas.max()
@@ -226,26 +227,26 @@ def init_noise_sigma(self):
         return (self.sigmas.max() ** 2 + 1) ** 0.5
 
     @property
-    def step_index(self):
+    def step_index(self) -> Optional[int]:
         """
         The index counter for current timestep. It will increase 1 after each scheduler step.
         """
         return self._step_index
 
     @property
-    def begin_index(self):
+    def begin_index(self) -> Optional[int]:
         """
         The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
         """
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -258,7 +259,7 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.T
         Args:
             sample (`torch.Tensor`):
                 The input sample.
-            timestep (`int`, *optional*):
+            timestep (`float` or `torch.Tensor`):
                 The current timestep in the diffusion chain.
 
         Returns:
@@ -274,7 +275,7 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.T
         self.is_scale_input_called = True
         return sample
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -319,7 +320,23 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -334,7 +351,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -357,13 +381,13 @@ def step(
         Args:
             model_output (`torch.Tensor`):
                 The direct output from learned diffusion model.
-            timestep (`float`):
+            timestep (`float` or `torch.Tensor`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
             generator (`torch.Generator`, *optional*):
                 A random number generator.
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a
                 [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
 
@@ -451,6 +475,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -478,5 +517,5 @@ def add_noise(
         noisy_samples = original_samples + noise * sigma
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index f58d918dbfbe..a55a76626cec 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -52,10 +52,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -63,16 +63,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -96,17 +97,17 @@ def alpha_bar_fn(t):
 
 
 # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
-def rescale_zero_terminal_snr(betas):
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -146,17 +147,17 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
             The starting `beta` value of inference.
         beta_end (`float`, defaults to 0.02):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
+        beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
             The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear` or `scaled_linear`.
+            `"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
-            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
-        interpolation_type(`str`, defaults to `"linear"`, *optional*):
-            The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be on of
+        prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`, *optional*):
+            Prediction type of the scheduler function; can be `"epsilon"` (predicts the noise of the diffusion
+            process), `"sample"` (directly predicts the noisy sample`) or `"v_prediction"` (see section 2.4 of [Imagen
+            Video](https://huggingface.co/papers/2210.02303) paper).
+        interpolation_type (`Literal["linear", "log_linear"]`, defaults to `"linear"`, *optional*):
+            The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of
             `"linear"` or `"log_linear"`.
         use_karras_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
@@ -166,18 +167,26 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
         use_beta_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
             Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
-        timestep_spacing (`str`, defaults to `"linspace"`):
+        sigma_min (`float`, *optional*):
+            The minimum sigma value for the noise schedule. If not provided, defaults to the last sigma in the
+            schedule.
+        sigma_max (`float`, *optional*):
+            The maximum sigma value for the noise schedule. If not provided, defaults to the first sigma in the
+            schedule.
+        timestep_spacing (`Literal["linspace", "leading", "trailing"]`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        timestep_type (`Literal["discrete", "continuous"]`, defaults to `"discrete"`):
+            The type of timesteps to use. Can be `"discrete"` or `"continuous"`.
         steps_offset (`int`, defaults to 0):
             An offset added to the inference steps, as required by some model families.
         rescale_betas_zero_snr (`bool`, defaults to `False`):
             Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
             dark samples instead of limiting it to samples with medium brightness. Loosely related to
             [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
-        final_sigmas_type (`str`, defaults to `"zero"`):
+        final_sigmas_type (`Literal["zero", "sigma_min"]`, defaults to `"zero"`):
             The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
-            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+            sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
     """
 
     _compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -189,20 +198,20 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        prediction_type: str = "epsilon",
-        interpolation_type: str = "linear",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
+        interpolation_type: Literal["linear", "log_linear"] = "linear",
         use_karras_sigmas: Optional[bool] = False,
         use_exponential_sigmas: Optional[bool] = False,
         use_beta_sigmas: Optional[bool] = False,
         sigma_min: Optional[float] = None,
         sigma_max: Optional[float] = None,
-        timestep_spacing: str = "linspace",
-        timestep_type: str = "discrete",  # can be "discrete" or "continuous"
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
+        timestep_type: Literal["discrete", "continuous"] = "discrete",
         steps_offset: int = 0,
         rescale_betas_zero_snr: bool = False,
-        final_sigmas_type: str = "zero",  # can be "zero" or "sigma_min"
+        final_sigmas_type: Literal["zero", "sigma_min"] = "zero",
     ):
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -259,8 +268,15 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     @property
-    def init_noise_sigma(self):
-        # standard deviation of the initial noise distribution
+    def init_noise_sigma(self) -> Union[float, torch.Tensor]:
+        """
+        The standard deviation of the initial noise distribution.
+
+        Returns:
+            `float` or `torch.Tensor`:
+                The standard deviation of the initial noise distribution, computed based on the maximum sigma value and
+                the timestep spacing configuration.
+        """
         max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max()
         if self.config.timestep_spacing in ["linspace", "trailing"]:
             return max_sigma
@@ -268,26 +284,34 @@ def init_noise_sigma(self):
         return (max_sigma**2 + 1) ** 0.5
 
     @property
-    def step_index(self):
+    def step_index(self) -> Optional[int]:
         """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase by 1 after each scheduler step.
+
+        Returns:
+            `int` or `None`:
+                The current step index, or `None` if not initialized.
         """
         return self._step_index
 
     @property
-    def begin_index(self):
+    def begin_index(self) -> Optional[int]:
         """
         The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+
+        Returns:
+            `int` or `None`:
+                The begin index for the scheduler, or `None` if not set.
         """
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -299,13 +323,13 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.T
 
         Args:
             sample (`torch.Tensor`):
-                The input sample.
-            timestep (`int`, *optional*):
+                The input sample to be scaled.
+            timestep (`float` or `torch.Tensor`):
                 The current timestep in the diffusion chain.
 
         Returns:
             `torch.Tensor`:
-                A scaled input sample.
+                A scaled input sample, divided by `(sigma**2 + 1) ** 0.5`.
         """
         if self.step_index is None:
             self._init_step_index(timestep)
@@ -318,17 +342,18 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.T
 
     def set_timesteps(
         self,
-        num_inference_steps: int = None,
-        device: Union[str, torch.device] = None,
+        num_inference_steps: Optional[int] = None,
+        device: Optional[Union[str, torch.device]] = None,
         timesteps: Optional[List[int]] = None,
         sigmas: Optional[List[float]] = None,
-    ):
+    ) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
-            num_inference_steps (`int`):
-                The number of diffusion steps used when generating samples with a pre-trained model.
+            num_inference_steps (`int`, *optional*):
+                The number of diffusion steps used when generating samples with a pre-trained model. If `None`,
+                `timesteps` or `sigmas` must be provided.
             device (`str` or `torch.device`, *optional*):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
             timesteps (`List[int]`, *optional*):
@@ -336,10 +361,9 @@ def set_timesteps(
                 based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
                 must be `None`, and `timestep_spacing` attribute will be ignored.
             sigmas (`List[float]`, *optional*):
-                Custom sigmas used to support arbitrary timesteps schedule schedule. If `None`, timesteps and sigmas
-                will be generated based on the relevant scheduler attributes. If `sigmas` is passed,
-                `num_inference_steps` and `timesteps` must be `None`, and the timesteps will be generated based on the
-                custom sigmas schedule.
+                Custom sigmas used to support arbitrary timesteps schedule. If `None`, timesteps and sigmas will be
+                generated based on the relevant scheduler attributes. If `sigmas` is passed, `num_inference_steps` and
+                `timesteps` must be `None`, and the timesteps will be generated based on the custom sigmas schedule.
         """
 
         if timesteps is not None and sigmas is not None:
@@ -449,7 +473,20 @@ def set_timesteps(
         self._begin_index = None
         self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -473,8 +510,21 @@ def _sigma_to_t(self, sigma, log_sigmas):
         return t
 
     # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
-    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -500,7 +550,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L26
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -523,7 +585,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -551,7 +630,23 @@ def _convert_to_beta(
         )
         return sigmas
 
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -565,7 +660,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
 
         return indices[pos].item()
 
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -591,26 +693,33 @@ def step(
 
         Args:
             model_output (`torch.Tensor`):
-                The direct output from learned diffusion model.
-            timestep (`float`):
+                The direct output from the learned diffusion model.
+            timestep (`float` or `torch.Tensor`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            s_churn (`float`):
-            s_tmin  (`float`):
-            s_tmax  (`float`):
-            s_noise (`float`, defaults to 1.0):
+            s_churn (`float`, *optional*, defaults to `0.0`):
+                Stochasticity parameter that controls the amount of noise added during sampling. Higher values increase
+                randomness.
+            s_tmin (`float`, *optional*, defaults to `0.0`):
+                Minimum timestep threshold for applying stochasticity. Only timesteps above this value will have noise
+                added.
+            s_tmax (`float`, *optional*, defaults to `inf`):
+                Maximum timestep threshold for applying stochasticity. Only timesteps below this value will have noise
+                added.
+            s_noise (`float`, *optional*, defaults to `1.0`):
                 Scaling factor for noise added to the sample.
             generator (`torch.Generator`, *optional*):
-                A random number generator.
-            return_dict (`bool`):
+                A random number generator for reproducible sampling.
+            return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
                 tuple.
 
         Returns:
             [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
-                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
-                returned, otherwise a tuple is returned where the first element is the sample tensor.
+                If `return_dict` is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor and the second
+                element is the predicted original sample.
         """
 
         if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
@@ -689,6 +798,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -717,6 +841,24 @@ def add_noise(
         return noisy_samples
 
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction for the given sample and noise at the specified timesteps.
+
+        This method implements the velocity prediction used in v-prediction models, which predicts a linear combination
+        of the sample and noise.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample for which to compute the velocity.
+            noise (`torch.Tensor`):
+                The noise tensor corresponding to the sample.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to compute the velocity.
+
+        Returns:
+            `torch.Tensor`:
+                The velocity prediction computed as `sqrt(alpha_prod) * noise - sqrt(1 - alpha_prod) * sample`.
+        """
         if (
             isinstance(timesteps, int)
             or isinstance(timesteps, torch.IntTensor)
@@ -753,5 +895,5 @@ def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: tor
         velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
         return velocity
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete_flax.py b/src/diffusers/schedulers/scheduling_euler_discrete_flax.py
index dae01302ac6a..09341c909d2e 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete_flax.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete_flax.py
@@ -74,7 +74,7 @@ class FlaxEulerDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
-            https://imagen.research.google/video/paper.pdf)
+            https://huggingface.co/papers/2210.02303)
         dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
             the `dtype` used for params and computation.
     """
diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
index 1a4f12ddfa53..9fd61d9e18d1 100644
--- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
@@ -160,7 +160,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -473,7 +473,20 @@ def step(
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -499,7 +512,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -523,7 +548,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
diff --git a/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py
index 38e5f1ba77a8..6febee444c5a 100644
--- a/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py
+++ b/src/diffusers/schedulers/scheduling_flow_match_heun_discrete.py
@@ -102,7 +102,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
diff --git a/src/diffusers/schedulers/scheduling_flow_match_lcm.py b/src/diffusers/schedulers/scheduling_flow_match_lcm.py
index 933bb1cf8e3d..25186d1fe969 100644
--- a/src/diffusers/schedulers/scheduling_flow_match_lcm.py
+++ b/src/diffusers/schedulers/scheduling_flow_match_lcm.py
@@ -168,7 +168,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -473,7 +473,20 @@ def step(
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -499,7 +512,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -523,7 +548,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py
index bd1239cfaec7..b113f9b49832 100644
--- a/src/diffusers/schedulers/scheduling_heun_discrete.py
+++ b/src/diffusers/schedulers/scheduling_heun_discrete.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -49,10 +49,10 @@ class HeunDiscreteSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -106,15 +107,15 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
             The starting `beta` value of inference.
         beta_end (`float`, defaults to 0.02):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
+        beta_schedule (`"linear"`, `"scaled_linear"`, `"squaredcos_cap_v2"`, or `"exp"`, defaults to `"linear"`):
             The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear` or `scaled_linear`.
+            `linear`, `scaled_linear`, `squaredcos_cap_v2`, or `exp`.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         clip_sample (`bool`, defaults to `True`):
             Clip the predicted sample for numerical stability.
         clip_sample_range (`float`, defaults to 1.0):
@@ -127,7 +128,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
         use_beta_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
             Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
-        timestep_spacing (`str`, defaults to `"linspace"`):
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
         steps_offset (`int`, defaults to 0):
@@ -143,17 +144,17 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.00085,  # sensible defaults
         beta_end: float = 0.012,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "exp"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
         use_karras_sigmas: Optional[bool] = False,
         use_exponential_sigmas: Optional[bool] = False,
         use_beta_sigmas: Optional[bool] = False,
         clip_sample: Optional[bool] = False,
         clip_sample_range: float = 1.0,
-        timestep_spacing: str = "linspace",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
         steps_offset: int = 0,
-    ):
+    ) -> None:
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
         if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
@@ -187,7 +188,23 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -224,12 +241,12 @@ def begin_index(self):
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -246,7 +263,7 @@ def scale_model_input(
         Args:
             sample (`torch.Tensor`):
                 The input sample.
-            timestep (`int`, *optional*):
+            timestep (`float` or `torch.Tensor`):
                 The current timestep in the diffusion chain.
 
         Returns:
@@ -266,19 +283,19 @@ def set_timesteps(
         device: Union[str, torch.device] = None,
         num_train_timesteps: Optional[int] = None,
         timesteps: Optional[List[int]] = None,
-    ):
+    ) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
-            num_inference_steps (`int`):
+            num_inference_steps (`int`, *optional*, defaults to `None`):
                 The number of diffusion steps used when generating samples with a pre-trained model.
-            device (`str` or `torch.device`, *optional*):
+            device (`str`, `torch.device`, *optional*, defaults to `None`):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-            num_train_timesteps (`int`, *optional*):
+            num_train_timesteps (`int`, *optional*, defaults to `None`):
                 The number of diffusion steps used when training the model. If `None`, the default
                 `num_train_timesteps` attribute is used.
-            timesteps (`List[int]`, *optional*):
+            timesteps (`List[int]`, *optional*, defaults to `None`):
                 Custom timesteps used to support arbitrary spacing between timesteps. If `None`, timesteps will be
                 generated based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps`
                 must be `None`, and `timestep_spacing` attribute will be ignored.
@@ -353,7 +370,20 @@ def set_timesteps(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -377,8 +407,21 @@ def _sigma_to_t(self, sigma, log_sigmas):
         return t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -404,7 +447,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -428,7 +483,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -461,7 +533,14 @@ def state_in_first_order(self):
         return self.dt is None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -579,6 +658,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -606,5 +700,5 @@ def add_noise(
         noisy_samples = original_samples + noise * sigma
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py
index 23bc21f10ca4..da188fe8297c 100644
--- a/src/diffusers/schedulers/scheduling_ipndm.py
+++ b/src/diffusers/schedulers/scheduling_ipndm.py
@@ -78,7 +78,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -112,7 +112,23 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         self._begin_index = None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -127,7 +143,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
index 6588464073a1..da40bed635e1 100644
--- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -50,10 +50,10 @@ class KDPM2AncestralDiscreteSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -124,7 +125,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         timestep_spacing (`str`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
@@ -206,7 +207,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -342,6 +343,19 @@ def set_timesteps(
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -366,7 +380,20 @@ def _sigma_to_t(self, sigma, log_sigmas):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -392,7 +419,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -416,7 +455,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -449,7 +505,23 @@ def state_in_first_order(self):
         return self.sample is None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -464,7 +536,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -586,6 +665,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
index 9b4cd4e204d6..6dc08d4d0a86 100644
--- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
+++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -49,10 +49,10 @@ class KDPM2DiscreteSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -123,7 +124,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         timestep_spacing (`str`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
@@ -206,7 +207,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -330,7 +331,23 @@ def state_in_first_order(self):
         return self.sample is None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -345,7 +362,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -355,6 +379,19 @@ def _init_step_index(self, timestep):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -379,7 +416,20 @@ def _sigma_to_t(self, sigma, log_sigmas):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -405,7 +455,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -429,7 +491,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -558,6 +637,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
diff --git a/src/diffusers/schedulers/scheduling_lcm.py b/src/diffusers/schedulers/scheduling_lcm.py
index cd7a29fe675f..0527f3533851 100644
--- a/src/diffusers/schedulers/scheduling_lcm.py
+++ b/src/diffusers/schedulers/scheduling_lcm.py
@@ -17,7 +17,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -51,10 +51,10 @@ class LCMSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -62,16 +62,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -99,13 +100,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -169,7 +170,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -251,7 +252,23 @@ def __init__(
         self._begin_index = None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -266,7 +283,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -291,7 +315,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -315,6 +339,8 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -322,6 +348,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -597,6 +631,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -619,6 +669,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -642,6 +707,17 @@ def __len__(self):
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
     def previous_timestep(self, timestep):
+        """
+        Compute the previous timestep in the diffusion chain.
+
+        Args:
+            timestep (`int`):
+                The current timestep.
+
+        Returns:
+            `int`:
+                The previous timestep.
+        """
         if self.custom_timesteps or self.num_inference_steps:
             index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
             if index == self.timesteps.shape[0] - 1:
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index c2450204aa8f..276af6eeacb7 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -14,7 +14,7 @@
 import math
 import warnings
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import scipy.stats
@@ -47,10 +47,10 @@ class LMSDiscreteSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -58,16 +58,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -98,15 +99,14 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
     methods the library implements for all schedulers such as loading and saving.
 
     Args:
-        num_train_timesteps (`int`, defaults to 1000):
+        num_train_timesteps (`int`, defaults to `1000`):
             The number of diffusion steps to train the model.
-        beta_start (`float`, defaults to 0.0001):
+        beta_start (`float`, defaults to `0.0001`):
             The starting `beta` value of inference.
-        beta_end (`float`, defaults to 0.02):
+        beta_end (`float`, defaults to `0.02`):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
-            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear` or `scaled_linear`.
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
         use_karras_sigmas (`bool`, *optional*, defaults to `False`):
@@ -117,14 +117,14 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
         use_beta_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
             Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
-        timestep_spacing (`str`, defaults to `"linspace"`):
+            Video](https://huggingface.co/papers/2210.02303) paper).
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
-        steps_offset (`int`, defaults to 0):
+        steps_offset (`int`, defaults to `0`):
             An offset added to the inference steps, as required by some model families.
     """
 
@@ -137,13 +137,13 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         use_karras_sigmas: Optional[bool] = False,
         use_exponential_sigmas: Optional[bool] = False,
         use_beta_sigmas: Optional[bool] = False,
-        prediction_type: str = "epsilon",
-        timestep_spacing: str = "linspace",
+        prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
         steps_offset: int = 0,
     ):
         if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
@@ -182,7 +182,15 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     @property
-    def init_noise_sigma(self):
+    def init_noise_sigma(self) -> Union[float, torch.Tensor]:
+        """
+        The standard deviation of the initial noise distribution.
+
+        Returns:
+            `float` or `torch.Tensor`:
+                The standard deviation of the initial noise distribution, computed based on the maximum sigma value and
+                the timestep spacing configuration.
+        """
         # standard deviation of the initial noise distribution
         if self.config.timestep_spacing in ["linspace", "trailing"]:
             return self.sigmas.max()
@@ -190,26 +198,34 @@ def init_noise_sigma(self):
         return (self.sigmas.max() ** 2 + 1) ** 0.5
 
     @property
-    def step_index(self):
+    def step_index(self) -> Optional[int]:
         """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase by 1 after each scheduler step.
+
+        Returns:
+            `int` or `None`:
+                The current step index, or `None` if not initialized.
         """
         return self._step_index
 
     @property
-    def begin_index(self):
+    def begin_index(self) -> Optional[int]:
         """
         The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+
+        Returns:
+            `int` or `None`:
+                The begin index for the scheduler, or `None` if not set.
         """
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -238,14 +254,21 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.T
         self.is_scale_input_called = True
         return sample
 
-    def get_lms_coefficient(self, order, t, current_order):
+    def get_lms_coefficient(self, order: int, t: int, current_order: int) -> float:
         """
         Compute the linear multistep coefficient.
 
         Args:
-            order ():
-            t ():
-            current_order ():
+            order (`int`):
+                The order of the linear multistep method.
+            t (`int`):
+                The current timestep index.
+            current_order (`int`):
+                The current order for which to compute the coefficient.
+
+        Returns:
+            `float`:
+                The computed linear multistep coefficient.
         """
 
         def lms_derivative(tau):
@@ -260,7 +283,7 @@ def lms_derivative(tau):
 
         return integrated_coeff
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -319,7 +342,23 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         self.derivatives = []
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -334,7 +373,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -343,7 +389,20 @@ def _init_step_index(self, timestep):
             self._step_index = self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -366,9 +425,19 @@ def _sigma_to_t(self, sigma, log_sigmas):
         t = t.reshape(sigma.shape)
         return t
 
-    # copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         sigma_min: float = in_sigmas[-1].item()
         sigma_max: float = in_sigmas[0].item()
@@ -382,7 +451,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor:
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -406,7 +487,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -521,6 +619,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -548,5 +661,5 @@ def add_noise(
         noisy_samples = original_samples + noise * sigma
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete_flax.py b/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
index b8e08ff9e134..3fd4dc8a5d61 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
@@ -77,7 +77,7 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
-            https://imagen.research.google/video/paper.pdf)
+            https://huggingface.co/papers/2210.02303)
         dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
             the `dtype` used for params and computation.
     """
diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index c07621179e2b..651532b06ddb 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -15,7 +15,7 @@
 # DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -26,10 +26,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -37,16 +37,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -78,15 +79,14 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
     methods the library implements for all schedulers such as loading and saving.
 
     Args:
-        num_train_timesteps (`int`, defaults to 1000):
+        num_train_timesteps (`int`, defaults to `1000`):
             The number of diffusion steps to train the model.
-        beta_start (`float`, defaults to 0.0001):
+        beta_start (`float`, defaults to `0.0001`):
             The starting `beta` value of inference.
-        beta_end (`float`, defaults to 0.02):
+        beta_end (`float`, defaults to `0.02`):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
-            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
+            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
         skip_prk_steps (`bool`, defaults to `False`):
@@ -96,14 +96,13 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
             Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
             there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
             otherwise it uses the alpha value at step 0.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"` or `"v_prediction"`, defaults to `"epsilon"`):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process)
-            or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf)
-            paper).
-        timestep_spacing (`str`, defaults to `"leading"`):
+            or `v_prediction` (see section 2.4 of [Imagen Video](https://huggingface.co/papers/2210.02303) paper).
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"leading"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
-        steps_offset (`int`, defaults to 0):
+        steps_offset (`int`, defaults to `0`):
             An offset added to the inference steps, as required by some model families.
     """
 
@@ -116,12 +115,12 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         skip_prk_steps: bool = False,
         set_alpha_to_one: bool = False,
-        prediction_type: str = "epsilon",
-        timestep_spacing: str = "leading",
+        prediction_type: Literal["epsilon", "v_prediction"] = "epsilon",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
         steps_offset: int = 0,
     ):
         if trained_betas is not None:
@@ -163,7 +162,7 @@ def __init__(
         self.plms_timesteps = None
         self.timesteps = None
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+    def set_timesteps(self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -242,7 +241,7 @@ def step(
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
 
         Returns:
@@ -275,14 +274,13 @@ def step_prk(
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
 
         Returns:
             [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                 If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                 tuple is returned where the first element is the sample tensor.
-
         """
         if self.num_inference_steps is None:
             raise ValueError(
@@ -334,14 +332,13 @@ def step_plms(
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple.
 
         Returns:
             [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                 If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                 tuple is returned where the first element is the sample tensor.
-
         """
         if self.num_inference_steps is None:
             raise ValueError(
@@ -402,19 +399,27 @@ def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tens
         """
         return sample
 
-    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
-        # See formula (9) of PNDM paper https://huggingface.co/papers/2202.09778
-        # this function computes x_(t−δ) using the formula of (9)
-        # Note that x_t needs to be added to both sides of the equation
-
-        # Notation (<variable name> -> <name in paper>
-        # alpha_prod_t -> α_t
-        # alpha_prod_t_prev -> α_(t−δ)
-        # beta_prod_t -> (1 - α_t)
-        # beta_prod_t_prev -> (1 - α_(t−δ))
-        # sample -> x_t
-        # model_output -> e_θ(x_t, t)
-        # prev_sample -> x_(t−δ)
+    def _get_prev_sample(
+        self, sample: torch.Tensor, timestep: int, prev_timestep: int, model_output: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Compute the previous sample x_(t-δ) from the current sample x_t using formula (9) from the [PNDM
+        paper](https://huggingface.co/papers/2202.09778).
+
+        Args:
+            sample (`torch.Tensor`):
+                The current sample x_t.
+            timestep (`int`):
+                The current timestep t.
+            prev_timestep (`int`):
+                The previous timestep (t-δ).
+            model_output (`torch.Tensor`):
+                The model output e_θ(x_t, t).
+
+        Returns:
+            `torch.Tensor`:
+                The previous sample x_(t-δ).
+        """
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
@@ -452,6 +457,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -472,5 +493,5 @@ def add_noise(
         noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_pndm_flax.py b/src/diffusers/schedulers/scheduling_pndm_flax.py
index 12e22005afaf..44bafccd5520 100644
--- a/src/diffusers/schedulers/scheduling_pndm_flax.py
+++ b/src/diffusers/schedulers/scheduling_pndm_flax.py
@@ -103,7 +103,7 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
-            https://imagen.research.google/video/paper.pdf)
+            https://huggingface.co/papers/2210.02303)
         dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
             the `dtype` used for params and computation.
     """
diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py
index 6530c5af9e5b..a2eaf8eb3abd 100644
--- a/src/diffusers/schedulers/scheduling_repaint.py
+++ b/src/diffusers/schedulers/scheduling_repaint.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -45,10 +45,10 @@ class RePaintSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -56,16 +56,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py
index 2979ce193a36..5783e20de69d 100644
--- a/src/diffusers/schedulers/scheduling_sasolver.py
+++ b/src/diffusers/schedulers/scheduling_sasolver.py
@@ -16,7 +16,7 @@
 # The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 
 import math
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -33,10 +33,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -44,16 +44,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -104,7 +105,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         tau_func (`Callable`, *optional*):
             Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`.
             SA-Solver will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample
@@ -253,7 +254,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -342,6 +343,8 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -349,6 +352,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -375,6 +386,19 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -399,6 +423,17 @@ def _sigma_to_t(self, sigma, log_sigmas):
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
     def _sigma_to_alpha_sigma_t(self, sigma):
+        """
+        Convert sigma values to alpha_t and sigma_t values.
+
+        Args:
+            sigma (`torch.Tensor`):
+                The sigma value(s) to convert.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing (alpha_t, sigma_t) values.
+        """
         if self.config.use_flow_sigmas:
             alpha_t = 1 - sigma
             sigma_t = sigma
@@ -410,7 +445,20 @@ def _sigma_to_alpha_sigma_t(self, sigma):
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
     def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -436,7 +484,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -460,7 +520,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -1037,7 +1114,22 @@ def stochastic_adams_moulton_update(
         return x_t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -1060,6 +1152,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
     def _init_step_index(self, timestep):
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -1193,6 +1289,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
diff --git a/src/diffusers/schedulers/scheduling_scm.py b/src/diffusers/schedulers/scheduling_scm.py
index 63b4a109ff9b..7b01d886299c 100644
--- a/src/diffusers/schedulers/scheduling_scm.py
+++ b/src/diffusers/schedulers/scheduling_scm.py
@@ -109,7 +109,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -173,7 +173,14 @@ def set_timesteps(
         self._begin_index = None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -182,7 +189,23 @@ def _init_step_index(self, timestep):
             self._step_index = self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py
index 3fd5c341eca9..7b4840ffdb19 100644
--- a/src/diffusers/schedulers/scheduling_tcd.py
+++ b/src/diffusers/schedulers/scheduling_tcd.py
@@ -17,7 +17,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -50,10 +50,10 @@ class TCDSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -98,13 +99,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -170,7 +171,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
         prediction_type (`str`, defaults to `epsilon`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
             `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            Video](https://huggingface.co/papers/2210.02303) paper).
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -252,7 +253,23 @@ def __init__(
         self._begin_index = None
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -267,7 +284,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -292,7 +316,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -316,6 +340,24 @@ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None
 
     # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler._get_variance
     def _get_variance(self, timestep, prev_timestep):
+        """
+        Computes the variance of the noise added at a given diffusion step.
+
+        For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
+        literature:
+            var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+        where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
+
+        Args:
+            timestep (`int`):
+                The current timestep in the diffusion process.
+            prev_timestep (`int`):
+                The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
+
+        Returns:
+            `torch.Tensor`:
+                The variance for the current timestep.
+        """
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
@@ -328,6 +370,8 @@ def _get_variance(self, timestep, prev_timestep):
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -335,6 +379,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -634,6 +686,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
@@ -656,6 +724,21 @@ def add_noise(
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
     def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+        """
+        Compute the velocity prediction from the sample and noise according to the velocity formula.
+
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            noise (`torch.Tensor`):
+                The noise tensor.
+            timesteps (`torch.IntTensor`):
+                The timesteps for velocity computation.
+
+        Returns:
+            `torch.Tensor`:
+                The computed velocity.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as sample
         self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
         alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -679,6 +762,17 @@ def __len__(self):
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
     def previous_timestep(self, timestep):
+        """
+        Compute the previous timestep in the diffusion chain.
+
+        Args:
+            timestep (`int`):
+                The current timestep.
+
+        Returns:
+            `int`:
+                The previous timestep.
+        """
         if self.custom_timesteps or self.num_inference_steps:
             index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
             if index == self.timesteps.shape[0] - 1:
diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py
index d78efabfbc57..5a978dec649b 100644
--- a/src/diffusers/schedulers/scheduling_unclip.py
+++ b/src/diffusers/schedulers/scheduling_unclip.py
@@ -14,7 +14,7 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -46,10 +46,10 @@ class UnCLIPSchedulerOutput(BaseOutput):
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -57,16 +57,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -334,6 +335,22 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+        diffusion process).
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps indicating the noise level for each sample.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
         # for the subsequent add_noise calls
diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py
index 162a34bd2774..689c6a06350b 100644
--- a/src/diffusers/schedulers/scheduling_unipc_multistep.py
+++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py
@@ -16,7 +16,7 @@
 # The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -32,10 +32,10 @@
 
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(
-    num_diffusion_timesteps,
-    max_beta=0.999,
-    alpha_transform_type="cosine",
-):
+    num_diffusion_timesteps: int,
+    max_beta: float = 0.999,
+    alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
     (1-beta) over time from t = [0,1].
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
 
-
     Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-                     Choose from `cosine` or `exp`
+        num_diffusion_timesteps (`int`):
+            The number of betas to produce.
+        max_beta (`float`, defaults to `0.999`):
+            The maximum beta to use; use values lower than 1 to avoid numerical instability.
+        alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+            The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
 
     Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+        `torch.Tensor`:
+            The betas used by the scheduler to step the model outputs.
     """
     if alpha_transform_type == "cosine":
 
@@ -76,17 +77,17 @@ def alpha_bar_fn(t):
 
 
 # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
-def rescale_zero_terminal_snr(betas):
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
     """
     Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
 
-
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
@@ -126,19 +127,19 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
             The starting `beta` value of inference.
         beta_end (`float`, defaults to 0.02):
             The final `beta` value.
-        beta_schedule (`str`, defaults to `"linear"`):
+        beta_schedule (`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`, defaults to `"linear"`):
             The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
             `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
-        solver_order (`int`, default `2`):
+        solver_order (`int`, defaults to `2`):
             The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
             due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
             unconditional sampling.
-        prediction_type (`str`, defaults to `epsilon`, *optional*):
+        prediction_type (`"epsilon"`, `"sample"`, `"v_prediction"`, or `"flow_prediction"`, defaults to `"epsilon"`, *optional*):
             Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
+            `sample` (directly predicts the noisy sample`), `v_prediction` (see section 2.4 of [Imagen
+            Video](https://huggingface.co/papers/2210.02303) paper), or `flow_prediction`.
         thresholding (`bool`, defaults to `False`):
             Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
             as Stable Diffusion.
@@ -148,7 +149,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
             The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
         predict_x0 (`bool`, defaults to `True`):
             Whether to use the updating algorithm on the predicted x0.
-        solver_type (`str`, default `bh2`):
+        solver_type (`"bh1"` or `"bh2"`, defaults to `"bh2"`):
             Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
             otherwise.
         lower_order_final (`bool`, default `True`):
@@ -170,12 +171,12 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
             Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
         use_flow_sigmas (`bool`, *optional*, defaults to `False`):
             Whether to use flow sigmas for step sizes in the noise schedule during the sampling process.
-        timestep_spacing (`str`, defaults to `"linspace"`):
+        timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"linspace"`):
             The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
             Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
         steps_offset (`int`, defaults to 0):
             An offset added to the inference steps, as required by some model families.
-        final_sigmas_type (`str`, defaults to `"zero"`):
+        final_sigmas_type (`"zero"` or `"sigma_min"`, defaults to `"zero"`):
             The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
             sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
         rescale_betas_zero_snr (`bool`, defaults to `False`):
@@ -193,30 +194,30 @@ def __init__(
         num_train_timesteps: int = 1000,
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
-        beta_schedule: str = "linear",
+        beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         solver_order: int = 2,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v_prediction", "flow_prediction"] = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         sample_max_value: float = 1.0,
         predict_x0: bool = True,
-        solver_type: str = "bh2",
+        solver_type: Literal["bh1", "bh2"] = "bh2",
         lower_order_final: bool = True,
         disable_corrector: List[int] = [],
-        solver_p: SchedulerMixin = None,
+        solver_p: Optional[SchedulerMixin] = None,
         use_karras_sigmas: Optional[bool] = False,
         use_exponential_sigmas: Optional[bool] = False,
         use_beta_sigmas: Optional[bool] = False,
         use_flow_sigmas: Optional[bool] = False,
         flow_shift: Optional[float] = 1.0,
-        timestep_spacing: str = "linspace",
+        timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
         steps_offset: int = 0,
-        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+        final_sigmas_type: Optional[Literal["zero", "sigma_min"]] = "zero",
         rescale_betas_zero_snr: bool = False,
         use_dynamic_shifting: bool = False,
-        time_shift_type: str = "exponential",
-    ):
+        time_shift_type: Literal["exponential"] = "exponential",
+    ) -> None:
         if self.config.use_beta_sigmas and not is_scipy_available():
             raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
         if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
@@ -278,33 +279,33 @@ def __init__(
         self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
 
     @property
-    def step_index(self):
+    def step_index(self) -> Optional[int]:
         """
         The index counter for current timestep. It will increase 1 after each scheduler step.
         """
         return self._step_index
 
     @property
-    def begin_index(self):
+    def begin_index(self) -> Optional[int]:
         """
         The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
         """
         return self._begin_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
+    def set_begin_index(self, begin_index: int = 0) -> None:
         """
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
 
     def set_timesteps(
-        self, num_inference_steps: int, device: Union[str, torch.device] = None, mu: Optional[float] = None
-    ):
+        self, num_inference_steps: int, device: Optional[Union[str, torch.device]] = None, mu: Optional[float] = None
+    ) -> None:
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
@@ -313,6 +314,8 @@ def set_timesteps(
                 The number of diffusion steps used when generating samples with a pre-trained model.
             device (`str` or `torch.device`, *optional*):
                 The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            mu (`float`, *optional*):
+                Optional mu parameter for dynamic shifting when using exponential time shift type.
         """
         # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://huggingface.co/papers/2305.08891
         if mu is not None:
@@ -432,6 +435,8 @@ def set_timesteps(
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         """
+        Apply dynamic thresholding to the predicted sample.
+
         "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
         prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
         s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -439,6 +444,14 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         photorealism as well as better image-text alignment, especially when using very large guidance weights."
 
         https://huggingface.co/papers/2205.11487
+
+        Args:
+            sample (`torch.Tensor`):
+                The predicted sample to be thresholded.
+
+        Returns:
+            `torch.Tensor`:
+                The thresholded sample.
         """
         dtype = sample.dtype
         batch_size, channels, *remaining_dims = sample.shape
@@ -464,7 +477,20 @@ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
         return sample
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
+    def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -488,7 +514,18 @@ def _sigma_to_t(self, sigma, log_sigmas):
         return t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t
-    def _sigma_to_alpha_sigma_t(self, sigma):
+    def _sigma_to_alpha_sigma_t(self, sigma: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Convert sigma values to alpha_t and sigma_t values.
+
+        Args:
+            sigma (`torch.Tensor`):
+                The sigma value(s) to convert.
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensor]`:
+                A tuple containing (alpha_t, sigma_t) values.
+        """
         if self.config.use_flow_sigmas:
             alpha_t = 1 - sigma
             sigma_t = sigma
@@ -499,8 +536,21 @@ def _sigma_to_alpha_sigma_t(self, sigma):
         return alpha_t, sigma_t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
-    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
+    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
+        """
+        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
+        Models](https://huggingface.co/papers/2206.00364).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following the Karras noise schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -526,7 +576,19 @@ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> to
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
     def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
-        """Constructs an exponential noise schedule."""
+        """
+        Construct an exponential noise schedule.
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following an exponential schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -550,7 +612,24 @@ def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps:
     def _convert_to_beta(
         self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
     ) -> torch.Tensor:
-        """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
+        """
+        Construct a beta noise schedule as proposed in [Beta Sampling is All You
+        Need](https://huggingface.co/papers/2407.12173).
+
+        Args:
+            in_sigmas (`torch.Tensor`):
+                The input sigma values to be converted.
+            num_inference_steps (`int`):
+                The number of inference steps to generate the noise schedule for.
+            alpha (`float`, *optional*, defaults to `0.6`):
+                The alpha parameter for the beta distribution.
+            beta (`float`, *optional*, defaults to `0.6`):
+                The beta parameter for the beta distribution.
+
+        Returns:
+            `torch.Tensor`:
+                The converted sigma values following a beta distribution schedule.
+        """
 
         # Hack to make sure that other schedulers which copy this function don't break
         # TODO: Add this logic to the other schedulers
@@ -918,7 +997,22 @@ def multistep_uni_c_bh_update(
         return x_t
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[int, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index for a given timestep in the schedule.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The timestep for which to find the index.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule.
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -938,9 +1032,13 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return step_index
 
     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[int, torch.Tensor]) -> None:
         """
         Initialize the step_index counter for the scheduler.
+
+        Args:
+            timestep (`int` or `torch.Tensor`):
+                The current timestep for which to initialize the step index.
         """
 
         if self.begin_index is None:
@@ -964,11 +1062,11 @@ def step(
         Args:
             model_output (`torch.Tensor`):
                 The direct output from learned diffusion model.
-            timestep (`int`):
+            timestep (`int` or `torch.Tensor`):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.Tensor`):
                 A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
+            return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
 
         Returns:
@@ -1053,6 +1151,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.IntTensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples without noise.
+            noise (`torch.Tensor`):
+                The noise to add to the samples.
+            timesteps (`torch.IntTensor`):
+                The timesteps at which to add noise to the samples.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -1081,5 +1194,5 @@ def add_noise(
         noisy_samples = alpha_t * original_samples + sigma_t * noise
         return noisy_samples
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.config.num_train_timesteps
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index cf77aaee8205..6884d3be9292 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -108,6 +108,7 @@
     is_tensorboard_available,
     is_timm_available,
     is_torch_available,
+    is_torch_mlu_available,
     is_torch_npu_available,
     is_torch_version,
     is_torch_xla_available,
diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py
index 42a53e181034..c46fa4363483 100644
--- a/src/diffusers/utils/constants.py
+++ b/src/diffusers/utils/constants.py
@@ -42,11 +42,10 @@
 DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
 DIFFUSERS_REQUEST_TIMEOUT = 60
 DIFFUSERS_ATTN_BACKEND = os.getenv("DIFFUSERS_ATTN_BACKEND", "native")
-DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0") in ENV_VARS_TRUE_VALUES
+DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0").upper() in ENV_VARS_TRUE_VALUES
 DEFAULT_HF_PARALLEL_LOADING_WORKERS = 8
 HF_ENABLE_PARALLEL_LOADING = os.environ.get("HF_ENABLE_PARALLEL_LOADING", "").upper() in ENV_VARS_TRUE_VALUES
 DIFFUSERS_DISABLE_REMOTE_CODE = os.getenv("DIFFUSERS_DISABLE_REMOTE_CODE", "false").upper() in ENV_VARS_TRUE_VALUES
-DIFFUSERS_ENABLE_HUB_KERNELS = os.environ.get("DIFFUSERS_ENABLE_HUB_KERNELS", "").upper() in ENV_VARS_TRUE_VALUES
 
 # Below should be `True` if the current version of `peft` and `transformers` are compatible with
 # PEFT backend. Will automatically fall back to PEFT backend if the correct versions of the libraries are
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 3c426d503996..8628893200fe 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -257,6 +257,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class TaylorSeerCacheConfig(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 def apply_faster_cache(*args, **kwargs):
     requires_backends(apply_faster_cache, ["torch"])
 
@@ -273,6 +288,10 @@ def apply_pyramid_attention_broadcast(*args, **kwargs):
     requires_backends(apply_pyramid_attention_broadcast, ["torch"])
 
 
+def apply_taylorseer_cache(*args, **kwargs):
+    requires_backends(apply_taylorseer_cache, ["torch"])
+
+
 class AllegroTransformer3DModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -408,6 +427,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class AutoencoderKLFlux2(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class AutoencoderKLHunyuanImage(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -453,6 +487,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class AutoencoderKLHunyuanVideo15(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class AutoencoderKLLTXVideo(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -648,6 +697,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class ChronoEditTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class CogVideoXTransformer3DModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -828,6 +892,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class Flux2Transformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class FluxControlNetModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -948,6 +1027,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class HunyuanVideo15Transformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class HunyuanVideoFramepackTransformer3DModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -1173,6 +1267,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class OvisImageTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class ParallelConfig(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -1308,6 +1417,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class SanaVideoTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class SD3ControlNetModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -1593,6 +1717,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class WanAnimateTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class WanTransformer3DModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -1623,6 +1762,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class ZImageTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 def attention_backend(*args, **kwargs):
     requires_backends(attention_backend, ["torch"])
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 20575ff2294d..79a21d2ac6e5 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -182,6 +182,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class Wan22AutoBlocks(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class WanAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -542,6 +557,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class ChronoEditPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class CLIPImageProjection(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -797,6 +827,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class Flux2Pipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class FluxControlImg2ImgPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -1097,6 +1142,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class HunyuanVideo15ImageToVideoPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class HunyuanVideo15Pipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class HunyuanVideoFramepackPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -1292,6 +1367,51 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class Kandinsky5I2IPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class Kandinsky5I2VPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class Kandinsky5T2IPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class Kandinsky5T2VPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -1877,6 +1997,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class OvisImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class PaintByExamplePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -2117,6 +2252,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class SanaImageToVideoPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class SanaPAGPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -2177,6 +2327,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class SanaVideoPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class SemanticStableDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -3467,6 +3632,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class WanAnimatePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class WanImageToVideoPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
@@ -3570,3 +3750,33 @@ def from_config(cls, *args, **kwargs):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
+
+
+class ZImageImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ZImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py
index 627b1e0604dc..1c0734cf35bb 100644
--- a/src/diffusers/utils/dynamic_modules_utils.py
+++ b/src/diffusers/utils/dynamic_modules_utils.py
@@ -254,6 +254,7 @@ def get_cached_module_file(
     token: Optional[Union[bool, str]] = None,
     revision: Optional[str] = None,
     local_files_only: bool = False,
+    local_dir: Optional[str] = None,
 ):
     """
     Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached
@@ -332,6 +333,7 @@ def get_cached_module_file(
                 force_download=force_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
+                local_dir=local_dir,
             )
             submodule = "git"
             module_file = pretrained_model_name_or_path + ".py"
@@ -355,6 +357,8 @@ def get_cached_module_file(
                 force_download=force_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
+                local_dir=local_dir,
+                revision=revision,
                 token=token,
             )
             submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/")))
@@ -415,6 +419,7 @@ def get_cached_module_file(
                     token=token,
                     revision=revision,
                     local_files_only=local_files_only,
+                    local_dir=local_dir,
                 )
     return os.path.join(full_submodule, module_file)
 
@@ -431,7 +436,7 @@ def get_class_from_dynamic_module(
     token: Optional[Union[bool, str]] = None,
     revision: Optional[str] = None,
     local_files_only: bool = False,
-    **kwargs,
+    local_dir: Optional[str] = None,
 ):
     """
     Extracts a class from a module file, present in the local folder or repository of a model.
@@ -496,5 +501,6 @@ def get_class_from_dynamic_module(
         token=token,
         revision=revision,
         local_files_only=local_files_only,
+        local_dir=local_dir,
     )
     return get_class_in_module(class_name, final_module)
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index adf8ed8b0694..57b0a337922a 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -192,6 +192,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b
 
 _torch_xla_available, _torch_xla_version = _is_package_available("torch_xla")
 _torch_npu_available, _torch_npu_version = _is_package_available("torch_npu")
+_torch_mlu_available, _torch_mlu_version = _is_package_available("torch_mlu")
 _transformers_available, _transformers_version = _is_package_available("transformers")
 _hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub")
 _kernels_available, _kernels_version = _is_package_available("kernels")
@@ -243,6 +244,10 @@ def is_torch_npu_available():
     return _torch_npu_available
 
 
+def is_torch_mlu_available():
+    return _torch_mlu_available
+
+
 def is_flax_available():
     return _flax_available
 
diff --git a/src/diffusers/utils/kernels_utils.py b/src/diffusers/utils/kernels_utils.py
deleted file mode 100644
index 26d6e3972fb7..000000000000
--- a/src/diffusers/utils/kernels_utils.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from ..utils import get_logger
-from .import_utils import is_kernels_available
-
-
-logger = get_logger(__name__)
-
-
-_DEFAULT_HUB_ID_FA3 = "kernels-community/flash-attn3"
-
-
-def _get_fa3_from_hub():
-    if not is_kernels_available():
-        return None
-    else:
-        from kernels import get_kernel
-
-        try:
-            # TODO: temporary revision for now. Remove when merged upstream into `main`.
-            flash_attn_3_hub = get_kernel(_DEFAULT_HUB_ID_FA3, revision="fake-ops-return-probs")
-            return flash_attn_3_hub
-        except Exception as e:
-            logger.error(f"An error occurred while fetching kernel '{_DEFAULT_HUB_ID_FA3}' from the Hub: {e}")
-            raise
diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index a1ab8cda431f..3b66fdadbef8 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -20,7 +20,7 @@
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 from . import logging
-from .import_utils import is_torch_available, is_torch_npu_available, is_torch_version
+from .import_utils import is_torch_available, is_torch_mlu_available, is_torch_npu_available, is_torch_version
 
 
 if is_torch_available():
@@ -242,8 +242,8 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T
 def apply_freeu(
     resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs
 ) -> Tuple["torch.Tensor", "torch.Tensor"]:
-    """Applies the FreeU mechanism as introduced in https:
-    //arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
+    """Applies the FreeU mechanism as introduced in https://huggingface.co/papers/2309.11497. Adapted from the official
+    code repository: https://github.com/ChenyangSi/FreeU.
 
     Args:
         resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
@@ -286,6 +286,8 @@ def get_device():
         return "xpu"
     elif torch.backends.mps.is_available():
         return "mps"
+    elif is_torch_mlu_available():
+        return "mlu"
     else:
         return "cpu"
 
diff --git a/src/diffusers/video_processor.py b/src/diffusers/video_processor.py
index 59b59b47d2c7..abeb30bca102 100644
--- a/src/diffusers/video_processor.py
+++ b/src/diffusers/video_processor.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 import warnings
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import PIL
 import torch
+import torch.nn.functional as F
 
 from .image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist
 
@@ -111,3 +112,65 @@ def postprocess_video(
             raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
 
         return outputs
+
+    @staticmethod
+    def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]:
+        r"""
+        Returns the binned height and width based on the aspect ratio.
+
+        Args:
+            height (`int`): The height of the image.
+            width (`int`): The width of the image.
+            ratios (`dict`): A dictionary where keys are aspect ratios and values are tuples of (height, width).
+
+        Returns:
+            `Tuple[int, int]`: The closest binned height and width.
+        """
+        ar = float(height / width)
+        closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
+        default_hw = ratios[closest_ratio]
+        return int(default_hw[0]), int(default_hw[1])
+
+    @staticmethod
+    def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor:
+        r"""
+        Resizes and crops a tensor of videos to the specified dimensions.
+
+        Args:
+            samples (`torch.Tensor`):
+                A tensor of shape (N, C, T, H, W) where N is the batch size, C is the number of channels, T is the
+                number of frames, H is the height, and W is the width.
+            new_width (`int`): The desired width of the output videos.
+            new_height (`int`): The desired height of the output videos.
+
+        Returns:
+            `torch.Tensor`: A tensor containing the resized and cropped videos.
+        """
+        orig_height, orig_width = samples.shape[3], samples.shape[4]
+
+        # Check if resizing is needed
+        if orig_height != new_height or orig_width != new_width:
+            ratio = max(new_height / orig_height, new_width / orig_width)
+            resized_width = int(orig_width * ratio)
+            resized_height = int(orig_height * ratio)
+
+            # Reshape to (N*T, C, H, W) for interpolation
+            n, c, t, h, w = samples.shape
+            samples = samples.permute(0, 2, 1, 3, 4).reshape(n * t, c, h, w)
+
+            # Resize
+            samples = F.interpolate(
+                samples, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+            )
+
+            # Center Crop
+            start_x = (resized_width - new_width) // 2
+            end_x = start_x + new_width
+            start_y = (resized_height - new_height) // 2
+            end_y = start_y + new_height
+            samples = samples[:, :, start_y:end_y, start_x:end_x]
+
+            # Reshape back to (N, C, T, H, W)
+            samples = samples.reshape(n, t, c, new_height, new_width).permute(0, 2, 1, 3, 4)
+
+        return samples
diff --git a/tests/hooks/test_group_offloading.py b/tests/hooks/test_group_offloading.py
index 96cbecfbf530..236094109d07 100644
--- a/tests/hooks/test_group_offloading.py
+++ b/tests/hooks/test_group_offloading.py
@@ -19,6 +19,7 @@
 import torch
 from parameterized import parameterized
 
+from diffusers import AutoencoderKL
 from diffusers.hooks import HookRegistry, ModelHook
 from diffusers.models import ModelMixin
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
@@ -149,6 +150,74 @@ def post_forward(self, module, output):
         return output
 
 
+# Model with only standalone computational layers at top level
+class DummyModelWithStandaloneLayers(ModelMixin):
+    def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
+        super().__init__()
+
+        self.layer1 = torch.nn.Linear(in_features, hidden_features)
+        self.activation = torch.nn.ReLU()
+        self.layer2 = torch.nn.Linear(hidden_features, hidden_features)
+        self.layer3 = torch.nn.Linear(hidden_features, out_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.layer1(x)
+        x = self.activation(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        return x
+
+
+# Model with deeply nested structure
+class DummyModelWithDeeplyNestedBlocks(ModelMixin):
+    def __init__(self, in_features: int, hidden_features: int, out_features: int) -> None:
+        super().__init__()
+
+        self.input_layer = torch.nn.Linear(in_features, hidden_features)
+        self.container = ContainerWithNestedModuleList(hidden_features)
+        self.output_layer = torch.nn.Linear(hidden_features, out_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.input_layer(x)
+        x = self.container(x)
+        x = self.output_layer(x)
+        return x
+
+
+class ContainerWithNestedModuleList(torch.nn.Module):
+    def __init__(self, features: int) -> None:
+        super().__init__()
+
+        # Top-level computational layer
+        self.proj_in = torch.nn.Linear(features, features)
+
+        # Nested container with ModuleList
+        self.nested_container = NestedContainer(features)
+
+        # Another top-level computational layer
+        self.proj_out = torch.nn.Linear(features, features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj_in(x)
+        x = self.nested_container(x)
+        x = self.proj_out(x)
+        return x
+
+
+class NestedContainer(torch.nn.Module):
+    def __init__(self, features: int) -> None:
+        super().__init__()
+
+        self.blocks = torch.nn.ModuleList([torch.nn.Linear(features, features), torch.nn.Linear(features, features)])
+        self.norm = torch.nn.LayerNorm(features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for block in self.blocks:
+            x = block(x)
+        x = self.norm(x)
+        return x
+
+
 @require_torch_accelerator
 class GroupOffloadTests(unittest.TestCase):
     in_features = 64
@@ -340,7 +409,7 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
         out = model(x)
         self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match.")
 
-        num_repeats = 4
+        num_repeats = 2
         for i in range(num_repeats):
             out_ref = model_ref(x)
             out = model(x)
@@ -362,3 +431,138 @@ def apply_layer_output_tracker_hook(model: DummyModelWithLayerNorm):
             self.assertLess(
                 cumulated_absmax, 1e-5, f"Output differences for {name} exceeded threshold: {cumulated_absmax:.5f}"
             )
+
+    def test_vae_like_model_without_streams(self):
+        """Test VAE-like model with block-level offloading but without streams."""
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        config = self.get_autoencoder_kl_config()
+        model = AutoencoderKL(**config)
+
+        model_ref = AutoencoderKL(**config)
+        model_ref.load_state_dict(model.state_dict(), strict=True)
+        model_ref.to(torch_device)
+
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=False)
+
+        x = torch.randn(2, 3, 32, 32).to(torch_device)
+
+        with torch.no_grad():
+            out_ref = model_ref(x).sample
+            out = model(x).sample
+
+        self.assertTrue(
+            torch.allclose(out_ref, out, atol=1e-5), "Outputs do not match for VAE-like model without streams."
+        )
+
+    def test_model_with_only_standalone_layers(self):
+        """Test that models with only standalone layers (no ModuleList/Sequential) work with block-level offloading."""
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        model = DummyModelWithStandaloneLayers(in_features=64, hidden_features=128, out_features=64)
+
+        model_ref = DummyModelWithStandaloneLayers(in_features=64, hidden_features=128, out_features=64)
+        model_ref.load_state_dict(model.state_dict(), strict=True)
+        model_ref.to(torch_device)
+
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
+
+        x = torch.randn(2, 64).to(torch_device)
+
+        with torch.no_grad():
+            for i in range(2):
+                out_ref = model_ref(x)
+                out = model(x)
+                self.assertTrue(
+                    torch.allclose(out_ref, out, atol=1e-5),
+                    f"Outputs do not match at iteration {i} for model with standalone layers.",
+                )
+
+    @parameterized.expand([("block_level",), ("leaf_level",)])
+    def test_standalone_conv_layers_with_both_offload_types(self, offload_type: str):
+        """Test that standalone Conv2d layers work correctly with both block-level and leaf-level offloading."""
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        config = self.get_autoencoder_kl_config()
+        model = AutoencoderKL(**config)
+
+        model_ref = AutoencoderKL(**config)
+        model_ref.load_state_dict(model.state_dict(), strict=True)
+        model_ref.to(torch_device)
+
+        model.enable_group_offload(torch_device, offload_type=offload_type, num_blocks_per_group=1, use_stream=True)
+
+        x = torch.randn(2, 3, 32, 32).to(torch_device)
+
+        with torch.no_grad():
+            out_ref = model_ref(x).sample
+            out = model(x).sample
+
+        self.assertTrue(
+            torch.allclose(out_ref, out, atol=1e-5),
+            f"Outputs do not match for standalone Conv layers with {offload_type}.",
+        )
+
+    def test_multiple_invocations_with_vae_like_model(self):
+        """Test that multiple forward passes work correctly with VAE-like model."""
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        config = self.get_autoencoder_kl_config()
+        model = AutoencoderKL(**config)
+
+        model_ref = AutoencoderKL(**config)
+        model_ref.load_state_dict(model.state_dict(), strict=True)
+        model_ref.to(torch_device)
+
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
+
+        x = torch.randn(2, 3, 32, 32).to(torch_device)
+
+        with torch.no_grad():
+            for i in range(2):
+                out_ref = model_ref(x).sample
+                out = model(x).sample
+                self.assertTrue(torch.allclose(out_ref, out, atol=1e-5), f"Outputs do not match at iteration {i}.")
+
+    def test_nested_container_parameters_offloading(self):
+        """Test that parameters from non-computational layers in nested containers are handled correctly."""
+        if torch.device(torch_device).type not in ["cuda", "xpu"]:
+            return
+
+        model = DummyModelWithDeeplyNestedBlocks(in_features=64, hidden_features=128, out_features=64)
+
+        model_ref = DummyModelWithDeeplyNestedBlocks(in_features=64, hidden_features=128, out_features=64)
+        model_ref.load_state_dict(model.state_dict(), strict=True)
+        model_ref.to(torch_device)
+
+        model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, use_stream=True)
+
+        x = torch.randn(2, 64).to(torch_device)
+
+        with torch.no_grad():
+            for i in range(2):
+                out_ref = model_ref(x)
+                out = model(x)
+                self.assertTrue(
+                    torch.allclose(out_ref, out, atol=1e-5),
+                    f"Outputs do not match at iteration {i} for nested parameters.",
+                )
+
+    def get_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None):
+        block_out_channels = block_out_channels or [2, 4]
+        norm_num_groups = norm_num_groups or 2
+        init_dict = {
+            "block_out_channels": block_out_channels,
+            "in_channels": 3,
+            "out_channels": 3,
+            "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels),
+            "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels),
+            "latent_channels": 4,
+            "norm_num_groups": norm_num_groups,
+            "layers_per_block": 1,
+        }
+        return init_dict
diff --git a/tests/lora/test_lora_layers_flux2.py b/tests/lora/test_lora_layers_flux2.py
new file mode 100644
index 000000000000..4ae189aceb66
--- /dev/null
+++ b/tests/lora/test_lora_layers_flux2.py
@@ -0,0 +1,168 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoProcessor, Mistral3ForConditionalGeneration
+
+from diffusers import AutoencoderKLFlux2, FlowMatchEulerDiscreteScheduler, Flux2Pipeline, Flux2Transformer2DModel
+
+from ..testing_utils import floats_tensor, require_peft_backend, torch_device
+
+
+sys.path.append(".")
+
+from .utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
+
+
+@require_peft_backend
+class Flux2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
+    pipeline_class = Flux2Pipeline
+    scheduler_cls = FlowMatchEulerDiscreteScheduler
+    scheduler_kwargs = {}
+
+    transformer_kwargs = {
+        "patch_size": 1,
+        "in_channels": 4,
+        "num_layers": 1,
+        "num_single_layers": 1,
+        "attention_head_dim": 16,
+        "num_attention_heads": 2,
+        "joint_attention_dim": 16,
+        "timestep_guidance_channels": 256,
+        "axes_dims_rope": [4, 4, 4, 4],
+    }
+    transformer_cls = Flux2Transformer2DModel
+    vae_kwargs = {
+        "sample_size": 32,
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ("DownEncoderBlock2D",),
+        "up_block_types": ("UpDecoderBlock2D",),
+        "block_out_channels": (4,),
+        "layers_per_block": 1,
+        "latent_channels": 1,
+        "norm_num_groups": 1,
+        "use_quant_conv": False,
+        "use_post_quant_conv": False,
+    }
+    vae_cls = AutoencoderKLFlux2
+
+    tokenizer_cls, tokenizer_id = AutoProcessor, "hf-internal-testing/tiny-mistral3-diffusers"
+    text_encoder_cls, text_encoder_id = Mistral3ForConditionalGeneration, "hf-internal-testing/tiny-mistral3-diffusers"
+    denoiser_target_modules = ["to_qkv_mlp_proj", "to_k"]
+
+    @property
+    def output_shape(self):
+        return (1, 8, 8, 3)
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "a dog is dancing",
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 8,
+            "output_type": "np",
+            "text_encoder_out_layers": (1,),
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    # Overriding because (1) text encoder LoRAs are not supported in Flux 2 and (2) because the Flux 2 single block
+    # QKV projections are always fused, it has no `to_q` param as expected by the original test.
+    def test_lora_fuse_nan(self):
+        components, _, denoiser_lora_config = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+        self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+        # corrupt one LoRA weight with `inf` values
+        with torch.no_grad():
+            possible_tower_names = ["transformer_blocks", "single_transformer_blocks"]
+            filtered_tower_names = [
+                tower_name for tower_name in possible_tower_names if hasattr(pipe.transformer, tower_name)
+            ]
+            if len(filtered_tower_names) == 0:
+                reason = f"`pipe.transformer` didn't have any of the following attributes: {possible_tower_names}."
+                raise ValueError(reason)
+            for tower_name in filtered_tower_names:
+                transformer_tower = getattr(pipe.transformer, tower_name)
+                is_single = "single" in tower_name
+                if is_single:
+                    transformer_tower[0].attn.to_qkv_mlp_proj.lora_A["adapter-1"].weight += float("inf")
+                else:
+                    transformer_tower[0].attn.to_k.lora_A["adapter-1"].weight += float("inf")
+
+        # with `safe_fusing=True` we should see an Error
+        with self.assertRaises(ValueError):
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
+
+        # without we should not see an error, but every image will be black
+        pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
+        out = pipe(**inputs)[0]
+
+        self.assertTrue(np.isnan(out).all())
+
+    @unittest.skip("Not supported in Flux2.")
+    def test_simple_inference_with_text_denoiser_block_scale(self):
+        pass
+
+    @unittest.skip("Not supported in Flux2.")
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        pass
+
+    @unittest.skip("Not supported in Flux2.")
+    def test_modify_padding_mode(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in Flux2.")
+    def test_simple_inference_with_partial_text_lora(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in Flux2.")
+    def test_simple_inference_with_text_lora(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in Flux2.")
+    def test_simple_inference_with_text_lora_and_scale(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in Flux2.")
+    def test_simple_inference_with_text_lora_fused(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in Flux2.")
+    def test_simple_inference_with_text_lora_save_load(self):
+        pass
diff --git a/tests/lora/test_lora_layers_z_image.py b/tests/lora/test_lora_layers_z_image.py
new file mode 100644
index 000000000000..35d1389d9612
--- /dev/null
+++ b/tests/lora/test_lora_layers_z_image.py
@@ -0,0 +1,285 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import unittest
+
+import numpy as np
+import torch
+from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, ZImagePipeline, ZImageTransformer2DModel
+
+from ..testing_utils import floats_tensor, is_peft_available, require_peft_backend, skip_mps, torch_device
+
+
+if is_peft_available():
+    from peft import LoraConfig
+
+
+sys.path.append(".")
+
+from .utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set  # noqa: E402
+
+
+@require_peft_backend
+class ZImageLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
+    pipeline_class = ZImagePipeline
+    scheduler_cls = FlowMatchEulerDiscreteScheduler
+    scheduler_kwargs = {}
+
+    transformer_kwargs = {
+        "all_patch_size": (2,),
+        "all_f_patch_size": (1,),
+        "in_channels": 16,
+        "dim": 32,
+        "n_layers": 2,
+        "n_refiner_layers": 1,
+        "n_heads": 2,
+        "n_kv_heads": 2,
+        "norm_eps": 1e-5,
+        "qk_norm": True,
+        "cap_feat_dim": 16,
+        "rope_theta": 256.0,
+        "t_scale": 1000.0,
+        "axes_dims": [8, 4, 4],
+        "axes_lens": [256, 32, 32],
+    }
+    transformer_cls = ZImageTransformer2DModel
+    vae_kwargs = {
+        "in_channels": 3,
+        "out_channels": 3,
+        "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+        "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+        "block_out_channels": [32, 64],
+        "layers_per_block": 1,
+        "latent_channels": 16,
+        "norm_num_groups": 32,
+        "sample_size": 32,
+        "scaling_factor": 0.3611,
+        "shift_factor": 0.1159,
+    }
+    vae_cls = AutoencoderKL
+    tokenizer_cls, tokenizer_id = Qwen2Tokenizer, "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"
+    text_encoder_cls, text_encoder_id = Qwen3Model, None  # Will be created inline
+    denoiser_target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
+
+    @property
+    def output_shape(self):
+        return (1, 32, 32, 3)
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 4,
+            "guidance_scale": 0.0,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 16,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    def get_dummy_components(self, scheduler_cls=None, use_dora=False, lora_alpha=None):
+        # Override to create Qwen3Model inline since it doesn't have a pretrained tiny model
+        torch.manual_seed(0)
+        config = Qwen3Config(
+            hidden_size=16,
+            intermediate_size=16,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            num_key_value_heads=2,
+            vocab_size=151936,
+            max_position_embeddings=512,
+        )
+        text_encoder = Qwen3Model(config)
+        tokenizer = Qwen2Tokenizer.from_pretrained(self.tokenizer_id)
+
+        transformer = self.transformer_cls(**self.transformer_kwargs)
+        # `x_pad_token` and `cap_pad_token` are initialized with `torch.empty`.
+        # This can cause NaN data values in our testing environment. Fixating them
+        # helps prevent that issue.
+        with torch.no_grad():
+            transformer.x_pad_token.copy_(torch.ones_like(transformer.x_pad_token.data))
+            transformer.cap_pad_token.copy_(torch.ones_like(transformer.cap_pad_token.data))
+        vae = self.vae_cls(**self.vae_kwargs)
+
+        if scheduler_cls is None:
+            scheduler_cls = self.scheduler_cls
+        scheduler = scheduler_cls(**self.scheduler_kwargs)
+
+        rank = 4
+        lora_alpha = rank if lora_alpha is None else lora_alpha
+
+        text_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=lora_alpha,
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+            init_lora_weights=False,
+            use_dora=use_dora,
+        )
+
+        denoiser_lora_config = LoraConfig(
+            r=rank,
+            lora_alpha=lora_alpha,
+            target_modules=self.denoiser_target_modules,
+            init_lora_weights=False,
+            use_dora=use_dora,
+        )
+
+        pipeline_components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+
+        return pipeline_components, text_lora_config, denoiser_lora_config
+
+    def test_correct_lora_configs_with_different_ranks(self):
+        components, _, denoiser_lora_config = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+        original_output = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+        pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1")
+
+        lora_output_same_rank = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+        pipe.transformer.delete_adapters("adapter-1")
+
+        denoiser = pipe.unet if self.unet_kwargs is not None else pipe.transformer
+        for name, _ in denoiser.named_modules():
+            if "to_k" in name and "attention" in name and "lora" not in name:
+                module_name_to_rank_update = name.replace(".base_layer.", ".")
+                break
+
+        # change the rank_pattern
+        updated_rank = denoiser_lora_config.r * 2
+        denoiser_lora_config.rank_pattern = {module_name_to_rank_update: updated_rank}
+
+        pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1")
+        updated_rank_pattern = pipe.transformer.peft_config["adapter-1"].rank_pattern
+
+        self.assertTrue(updated_rank_pattern == {module_name_to_rank_update: updated_rank})
+
+        lora_output_diff_rank = pipe(**inputs, generator=torch.manual_seed(0))[0]
+        self.assertTrue(not np.allclose(original_output, lora_output_same_rank, atol=1e-3, rtol=1e-3))
+        self.assertTrue(not np.allclose(lora_output_diff_rank, lora_output_same_rank, atol=1e-3, rtol=1e-3))
+
+        pipe.transformer.delete_adapters("adapter-1")
+
+        # similarly change the alpha_pattern
+        updated_alpha = denoiser_lora_config.lora_alpha * 2
+        denoiser_lora_config.alpha_pattern = {module_name_to_rank_update: updated_alpha}
+
+        pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1")
+        self.assertTrue(
+            pipe.transformer.peft_config["adapter-1"].alpha_pattern == {module_name_to_rank_update: updated_alpha}
+        )
+
+        lora_output_diff_alpha = pipe(**inputs, generator=torch.manual_seed(0))[0]
+        self.assertTrue(not np.allclose(original_output, lora_output_diff_alpha, atol=1e-3, rtol=1e-3))
+        self.assertTrue(not np.allclose(lora_output_diff_alpha, lora_output_same_rank, atol=1e-3, rtol=1e-3))
+
+    @skip_mps
+    def test_lora_fuse_nan(self):
+        components, _, denoiser_lora_config = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+        self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+        # corrupt one LoRA weight with `inf` values
+        with torch.no_grad():
+            possible_tower_names = ["noise_refiner"]
+            filtered_tower_names = [
+                tower_name for tower_name in possible_tower_names if hasattr(pipe.transformer, tower_name)
+            ]
+            for tower_name in filtered_tower_names:
+                transformer_tower = getattr(pipe.transformer, tower_name)
+                transformer_tower[0].attention.to_q.lora_A["adapter-1"].weight += float("inf")
+
+        # with `safe_fusing=True` we should see an Error
+        with self.assertRaises(ValueError):
+            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True)
+
+        # without we should not see an error, but every image will be black
+        pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False)
+        out = pipe(**inputs)[0]
+
+        self.assertTrue(np.isnan(out).all())
+
+    def test_lora_scale_kwargs_match_fusion(self):
+        super().test_lora_scale_kwargs_match_fusion(5e-2, 5e-2)
+
+    @unittest.skip("Needs to be debugged.")
+    def test_set_adapters_match_attention_kwargs(self):
+        super().test_set_adapters_match_attention_kwargs()
+
+    @unittest.skip("Needs to be debugged.")
+    def test_simple_inference_with_text_denoiser_lora_and_scale(self):
+        super().test_simple_inference_with_text_denoiser_lora_and_scale()
+
+    @unittest.skip("Not supported in ZImage.")
+    def test_simple_inference_with_text_denoiser_block_scale(self):
+        pass
+
+    @unittest.skip("Not supported in ZImage.")
+    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
+        pass
+
+    @unittest.skip("Not supported in ZImage.")
+    def test_modify_padding_mode(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in ZImage.")
+    def test_simple_inference_with_partial_text_lora(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in ZImage.")
+    def test_simple_inference_with_text_lora(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in ZImage.")
+    def test_simple_inference_with_text_lora_and_scale(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in ZImage.")
+    def test_simple_inference_with_text_lora_fused(self):
+        pass
+
+    @unittest.skip("Text encoder LoRA is not supported in ZImage.")
+    def test_simple_inference_with_text_lora_save_load(self):
+        pass
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index 3d4344bb86a9..5fae6cac0a7f 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -28,6 +28,7 @@
     AutoencoderKL,
     UNet2DConditionModel,
 )
+from diffusers.hooks.group_offloading import _GROUP_OFFLOADING, apply_group_offloading
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_peft_available
 
@@ -2367,3 +2368,51 @@ def test_lora_loading_model_cpu_offload(self):
 
         output_lora_loaded = pipe(**inputs, generator=torch.manual_seed(0))[0]
         self.assertTrue(np.allclose(output_lora, output_lora_loaded, atol=1e-3, rtol=1e-3))
+
+    @require_torch_accelerator
+    def test_lora_group_offloading_delete_adapters(self):
+        components, _, denoiser_lora_config = self.get_dummy_components()
+        _, _, inputs = self.get_dummy_inputs(with_generator=False)
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config)
+        self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+        try:
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True)
+                lora_state_dicts = self._get_lora_state_dicts(modules_to_save)
+                self.pipeline_class.save_lora_weights(
+                    save_directory=tmpdirname, safe_serialization=True, **lora_state_dicts
+                )
+
+                components, _, _ = self.get_dummy_components()
+                pipe = self.pipeline_class(**components)
+                denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+                pipe.to(torch_device)
+
+                # Enable Group Offloading (leaf_level for more granular testing)
+                apply_group_offloading(
+                    denoiser,
+                    onload_device=torch_device,
+                    offload_device="cpu",
+                    offload_type="leaf_level",
+                )
+
+                pipe.load_lora_weights(tmpdirname, adapter_name="default")
+
+                out_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+                # Delete the adapter
+                pipe.delete_adapters("default")
+
+                out_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
+
+                self.assertFalse(np.allclose(out_lora, out_no_lora, atol=1e-3, rtol=1e-3))
+        finally:
+            # Clean up the hooks to prevent state leak
+            if hasattr(denoiser, "_diffusers_hook"):
+                denoiser._diffusers_hook.remove_hook(_GROUP_OFFLOADING, recurse=True)
diff --git a/tests/models/autoencoders/test_models_autoencoder_dc.py b/tests/models/autoencoders/test_models_autoencoder_dc.py
index d34001e7b903..b1b5531d0134 100644
--- a/tests/models/autoencoders/test_models_autoencoder_dc.py
+++ b/tests/models/autoencoders/test_models_autoencoder_dc.py
@@ -82,3 +82,7 @@ def prepare_init_args_and_inputs_for_common(self):
     @unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
     def test_layerwise_casting_inference(self):
         super().test_layerwise_casting_inference()
+
+    @unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
+    def test_layerwise_casting_memory(self):
+        super().test_layerwise_casting_memory()
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 6f4c3d544b45..b9dfe932335c 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -47,6 +47,7 @@
     XFormersAttnProcessor,
 )
 from diffusers.models.auto_model import AutoModel
+from diffusers.models.modeling_outputs import BaseOutput
 from diffusers.training_utils import EMAModel
 from diffusers.utils import (
     SAFE_WEIGHTS_INDEX_NAME,
@@ -108,6 +109,11 @@ def check_if_lora_correctly_set(model) -> bool:
     return False
 
 
+def normalize_output(out):
+    out0 = out[0] if isinstance(out, (BaseOutput, tuple)) else out
+    return torch.stack(out0) if isinstance(out0, list) else out0
+
+
 # Will be run via run_test_in_subprocess
 def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout):
     error = None
@@ -536,6 +542,9 @@ def test_from_save_pretrained(self, expected_max_diff=5e-5):
             if isinstance(new_image, dict):
                 new_image = new_image.to_tuple()[0]
 
+            image = normalize_output(image)
+            new_image = normalize_output(new_image)
+
         max_diff = (image - new_image).abs().max().item()
         self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
 
@@ -780,6 +789,9 @@ def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
             if isinstance(new_image, dict):
                 new_image = new_image.to_tuple()[0]
 
+            image = normalize_output(image)
+            new_image = normalize_output(new_image)
+
         max_diff = (image - new_image).abs().max().item()
         self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
 
@@ -842,6 +854,9 @@ def test_determinism(self, expected_max_diff=1e-5):
             if isinstance(second, dict):
                 second = second.to_tuple()[0]
 
+            first = normalize_output(first)
+            second = normalize_output(second)
+
         out_1 = first.cpu().numpy()
         out_2 = second.cpu().numpy()
         out_1 = out_1[~np.isnan(out_1)]
@@ -860,11 +875,15 @@ def test_output(self, expected_output_shape=None):
 
             if isinstance(output, dict):
                 output = output.to_tuple()[0]
+            if isinstance(output, list):
+                output = torch.stack(output)
 
         self.assertIsNotNone(output)
 
         # input & output have to have the same shape
         input_tensor = inputs_dict[self.main_input_name]
+        if isinstance(input_tensor, list):
+            input_tensor = torch.stack(input_tensor)
 
         if expected_output_shape is None:
             expected_shape = input_tensor.shape
@@ -898,11 +917,15 @@ def test_model_from_pretrained(self):
 
             if isinstance(output_1, dict):
                 output_1 = output_1.to_tuple()[0]
+            if isinstance(output_1, list):
+                output_1 = torch.stack(output_1)
 
             output_2 = new_model(**inputs_dict)
 
             if isinstance(output_2, dict):
                 output_2 = output_2.to_tuple()[0]
+            if isinstance(output_2, list):
+                output_2 = torch.stack(output_2)
 
         self.assertEqual(output_1.shape, output_2.shape)
 
@@ -1138,6 +1161,8 @@ def test_save_load_lora_adapter(self, rank, lora_alpha, use_dora=False):
 
         torch.manual_seed(0)
         output_no_lora = model(**inputs_dict, return_dict=False)[0]
+        if isinstance(output_no_lora, list):
+            output_no_lora = torch.stack(output_no_lora)
 
         denoiser_lora_config = LoraConfig(
             r=rank,
@@ -1151,6 +1176,8 @@ def test_save_load_lora_adapter(self, rank, lora_alpha, use_dora=False):
 
         torch.manual_seed(0)
         outputs_with_lora = model(**inputs_dict, return_dict=False)[0]
+        if isinstance(outputs_with_lora, list):
+            outputs_with_lora = torch.stack(outputs_with_lora)
 
         self.assertFalse(torch.allclose(output_no_lora, outputs_with_lora, atol=1e-4, rtol=1e-4))
 
@@ -1175,6 +1202,8 @@ def test_save_load_lora_adapter(self, rank, lora_alpha, use_dora=False):
 
         torch.manual_seed(0)
         outputs_with_lora_2 = model(**inputs_dict, return_dict=False)[0]
+        if isinstance(outputs_with_lora_2, list):
+            outputs_with_lora_2 = torch.stack(outputs_with_lora_2)
 
         self.assertFalse(torch.allclose(output_no_lora, outputs_with_lora_2, atol=1e-4, rtol=1e-4))
         self.assertTrue(torch.allclose(outputs_with_lora, outputs_with_lora_2, atol=1e-4, rtol=1e-4))
@@ -1296,31 +1325,35 @@ def test_lora_adapter_wrong_metadata_raises_error(self):
     def test_cpu_offload(self):
         if self.model_class._no_split_modules is None:
             pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")
+
         config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**config).eval()
-
         model = model.to(torch_device)
 
         torch.manual_seed(0)
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_sizes(model)[""]
-        # We test several splits of sizes to make sure it works.
         max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]]
+
         with tempfile.TemporaryDirectory() as tmp_dir:
             model.cpu().save_pretrained(tmp_dir)
 
             for max_size in max_gpu_sizes:
                 max_memory = {0: max_size, "cpu": model_size * 2}
                 new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)
+
                 # Making sure part of the model will actually end up offloaded
                 self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"})
 
                 self.check_device_map_is_respected(new_model, new_model.hf_device_map)
+
                 torch.manual_seed(0)
                 new_output = new_model(**inputs_dict)
+                new_normalized_output = normalize_output(new_output)
 
-                self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+                self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
 
     @require_torch_accelerator
     def test_disk_offload_without_safetensors(self):
@@ -1333,6 +1366,7 @@ def test_disk_offload_without_safetensors(self):
 
         torch.manual_seed(0)
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_sizes(model)[""]
         max_size = int(self.model_split_percents[0] * model_size)
@@ -1352,8 +1386,8 @@ def test_disk_offload_without_safetensors(self):
             self.check_device_map_is_respected(new_model, new_model.hf_device_map)
             torch.manual_seed(0)
             new_output = new_model(**inputs_dict)
-
-            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+            new_normalized_output = normalize_output(new_output)
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
 
     @require_torch_accelerator
     def test_disk_offload_with_safetensors(self):
@@ -1366,6 +1400,7 @@ def test_disk_offload_with_safetensors(self):
 
         torch.manual_seed(0)
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_sizes(model)[""]
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -1380,8 +1415,9 @@ def test_disk_offload_with_safetensors(self):
             self.check_device_map_is_respected(new_model, new_model.hf_device_map)
             torch.manual_seed(0)
             new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
 
-            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
 
     @require_torch_multi_accelerator
     def test_model_parallelism(self):
@@ -1422,6 +1458,7 @@ def test_sharded_checkpoints(self):
         model = model.to(torch_device)
 
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_persistent_sizes(model)[""]
         max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
@@ -1443,8 +1480,9 @@ def test_sharded_checkpoints(self):
             if "generator" in inputs_dict:
                 _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
             new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
 
-            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
 
     @require_torch_accelerator
     def test_sharded_checkpoints_with_variant(self):
@@ -1454,6 +1492,7 @@ def test_sharded_checkpoints_with_variant(self):
         model = model.to(torch_device)
 
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_persistent_sizes(model)[""]
         max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
@@ -1481,8 +1520,9 @@ def test_sharded_checkpoints_with_variant(self):
             if "generator" in inputs_dict:
                 _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
             new_output = new_model(**inputs_dict)
+            new_normalized_output = normalize_output(new_output)
 
-            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
 
     @require_torch_accelerator
     def test_sharded_checkpoints_with_parallel_loading(self):
@@ -1492,6 +1532,7 @@ def test_sharded_checkpoints_with_parallel_loading(self):
         model = model.to(torch_device)
 
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_persistent_sizes(model)[""]
         max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
@@ -1515,7 +1556,9 @@ def test_sharded_checkpoints_with_parallel_loading(self):
             if "generator" in inputs_dict:
                 _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
             new_output = new_model(**inputs_dict)
-            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
             # set to no.
             os.environ["HF_ENABLE_PARALLEL_LOADING"] = "no"
 
@@ -1529,6 +1572,7 @@ def test_sharded_checkpoints_device_map(self):
 
         torch.manual_seed(0)
         base_output = model(**inputs_dict)
+        base_normalized_output = normalize_output(base_output)
 
         model_size = compute_module_persistent_sizes(model)[""]
         max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
@@ -1549,7 +1593,9 @@ def test_sharded_checkpoints_device_map(self):
             if "generator" in inputs_dict:
                 _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
             new_output = new_model(**inputs_dict)
-            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+            new_normalized_output = normalize_output(new_output)
+
+            self.assertTrue(torch.allclose(base_normalized_output, new_normalized_output, atol=1e-5))
 
     # This test is okay without a GPU because we're not running any execution. We're just serializing
     # and check if the resultant files are following an expected format.
@@ -1629,7 +1675,9 @@ def test_layerwise_casting_inference(self):
         model = self.model_class(**config)
         model.eval()
         model.to(torch_device)
-        base_slice = model(**inputs_dict)[0].detach().flatten().cpu().numpy()
+        base_slice = model(**inputs_dict)[0]
+        base_slice = normalize_output(base_slice)
+        base_slice = base_slice.detach().flatten().cpu().numpy()
 
         def check_linear_dtype(module, storage_dtype, compute_dtype):
             patterns_to_check = DEFAULT_SKIP_MODULES_PATTERN
@@ -1655,7 +1703,9 @@ def test_layerwise_casting(storage_dtype, compute_dtype):
             model.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
 
             check_linear_dtype(model, storage_dtype, compute_dtype)
-            output = model(**inputs_dict)[0].float().flatten().detach().cpu().numpy()
+            output = model(**inputs_dict)[0]
+            output = normalize_output(output)
+            output = output.float().flatten().detach().cpu().numpy()
 
             # The precision test is not very important for fast tests. In most cases, the outputs will not be the same.
             # We just want to make sure that the layerwise casting is working as expected.
@@ -1716,6 +1766,12 @@ def get_memory_usage(storage_dtype, compute_dtype):
     @parameterized.expand([False, True])
     @require_torch_accelerator
     def test_group_offloading(self, record_stream):
+        for cls in inspect.getmro(self.__class__):
+            if "test_group_offloading" in cls.__dict__ and cls is not ModelTesterMixin:
+                # Skip this test if it is overwritten by child class. We need to do this because parameterized
+                # materializes the test methods on invocation which cannot be overridden.
+                pytest.skip("Model does not support group offloading.")
+
         if not self.model_class._supports_group_offloading:
             pytest.skip("Model does not support group offloading.")
 
@@ -1735,24 +1791,27 @@ def run_forward(model):
             return model(**inputs_dict)[0]
 
         model = self.model_class(**init_dict)
-
         model.to(torch_device)
         output_without_group_offloading = run_forward(model)
+        output_without_group_offloading = normalize_output(output_without_group_offloading)
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
         model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1)
         output_with_group_offloading1 = run_forward(model)
+        output_with_group_offloading1 = normalize_output(output_with_group_offloading1)
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
         model.enable_group_offload(torch_device, offload_type="block_level", num_blocks_per_group=1, non_blocking=True)
         output_with_group_offloading2 = run_forward(model)
+        output_with_group_offloading2 = normalize_output(output_with_group_offloading2)
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
         model.enable_group_offload(torch_device, offload_type="leaf_level")
         output_with_group_offloading3 = run_forward(model)
+        output_with_group_offloading3 = normalize_output(output_with_group_offloading3)
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
@@ -1760,6 +1819,7 @@ def run_forward(model):
             torch_device, offload_type="leaf_level", use_stream=True, record_stream=record_stream
         )
         output_with_group_offloading4 = run_forward(model)
+        output_with_group_offloading4 = normalize_output(output_with_group_offloading4)
 
         self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading1, atol=1e-5))
         self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading2, atol=1e-5))
@@ -1799,6 +1859,12 @@ def test_group_offloading_with_layerwise_casting(self, record_stream, offload_ty
     @torch.no_grad()
     @torch.inference_mode()
     def test_group_offloading_with_disk(self, offload_type, record_stream, atol=1e-5):
+        for cls in inspect.getmro(self.__class__):
+            if "test_group_offloading_with_disk" in cls.__dict__ and cls is not ModelTesterMixin:
+                # Skip this test if it is overwritten by child class. We need to do this because parameterized
+                # materializes the test methods on invocation which cannot be overridden.
+                pytest.skip("Model does not support group offloading with disk yet.")
+
         if not self.model_class._supports_group_offloading:
             pytest.skip("Model does not support group offloading.")
 
@@ -1814,9 +1880,6 @@ def _run_forward(model, inputs_dict):
             torch.manual_seed(0)
             return model(**inputs_dict)[0]
 
-        if self.__class__.__name__ == "AutoencoderKLCosmosTests" and offload_type == "leaf_level":
-            pytest.skip("With `leaf_type` as the offloading type, it fails. Needs investigation.")
-
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
@@ -1824,6 +1887,7 @@ def _run_forward(model, inputs_dict):
         model.eval()
         model.to(torch_device)
         output_without_group_offloading = _run_forward(model, inputs_dict)
+        output_without_group_offloading = normalize_output(output_without_group_offloading)
 
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
@@ -1851,6 +1915,9 @@ def _run_forward(model, inputs_dict):
                     offload_to_disk_path=tmpdir,
                     offload_type=offload_type,
                     num_blocks_per_group=num_blocks_per_group,
+                    block_modules=model._group_offload_block_modules
+                    if hasattr(model, "_group_offload_block_modules")
+                    else None,
                 )
                 if not is_correct:
                     if extra_files:
@@ -1859,6 +1926,7 @@ def _run_forward(model, inputs_dict):
                         raise ValueError(f"Following files are missing: {', '.join(missing_files)}")
 
             output_with_group_offloading = _run_forward(model, inputs_dict)
+            output_with_group_offloading = normalize_output(output_with_group_offloading)
             self.assertTrue(torch.allclose(output_without_group_offloading, output_with_group_offloading, atol=atol))
 
     def test_auto_model(self, expected_max_diff=5e-5):
@@ -1892,10 +1960,17 @@ def test_auto_model(self, expected_max_diff=5e-5):
                 output_original = model(**inputs_dict)
                 output_auto = auto_model(**inputs_dict)
 
-            if isinstance(output_original, dict):
-                output_original = output_original.to_tuple()[0]
-            if isinstance(output_auto, dict):
-                output_auto = output_auto.to_tuple()[0]
+        if isinstance(output_original, dict):
+            output_original = output_original.to_tuple()[0]
+        if isinstance(output_auto, dict):
+            output_auto = output_auto.to_tuple()[0]
+
+        if isinstance(output_original, list):
+            output_original = torch.stack(output_original)
+        if isinstance(output_auto, list):
+            output_auto = torch.stack(output_auto)
+
+        output_original, output_auto = output_original.float(), output_auto.float()
 
         max_diff = (output_original - output_auto).abs().max().item()
         self.assertLessEqual(
@@ -2086,6 +2161,8 @@ def test_torch_compile_repeated_blocks(self):
         recompile_limit = 1
         if self.model_class.__name__ == "UNet2DConditionModel":
             recompile_limit = 2
+        elif self.model_class.__name__ == "ZImageTransformer2DModel":
+            recompile_limit = 3
 
         with (
             torch._inductor.utils.fresh_inductor_cache(),
@@ -2187,7 +2264,6 @@ def tearDown(self):
         backend_empty_cache(torch_device)
 
     def get_lora_config(self, lora_rank, lora_alpha, target_modules):
-        # from diffusers test_models_unet_2d_condition.py
         from peft import LoraConfig
 
         lora_config = LoraConfig(
diff --git a/tests/models/transformers/test_models_transformer_flux2.py b/tests/models/transformers/test_models_transformer_flux2.py
new file mode 100644
index 000000000000..316d5fa770bb
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_flux2.py
@@ -0,0 +1,162 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import Flux2Transformer2DModel, attention_backend
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..test_modeling_common import LoraHotSwappingForModelTesterMixin, ModelTesterMixin, TorchCompileTesterMixin
+
+
+enable_full_determinism()
+
+
+class Flux2TransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    main_input_name = "hidden_states"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.7, 0.6, 0.6]
+
+    # Skip setting testing with default: AttnProcessor
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        return self.prepare_dummy_input()
+
+    @property
+    def input_shape(self):
+        return (16, 4)
+
+    @property
+    def output_shape(self):
+        return (16, 4)
+
+    def prepare_dummy_input(self, height=4, width=4):
+        batch_size = 1
+        num_latent_channels = 4
+        sequence_length = 48
+        embedding_dim = 32
+
+        hidden_states = torch.randn((batch_size, height * width, num_latent_channels)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, embedding_dim)).to(torch_device)
+
+        t_coords = torch.arange(1)
+        h_coords = torch.arange(height)
+        w_coords = torch.arange(width)
+        l_coords = torch.arange(1)
+        image_ids = torch.cartesian_prod(t_coords, h_coords, w_coords, l_coords)  # [height * width, 4]
+        image_ids = image_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
+
+        text_t_coords = torch.arange(1)
+        text_h_coords = torch.arange(1)
+        text_w_coords = torch.arange(1)
+        text_l_coords = torch.arange(sequence_length)
+        text_ids = torch.cartesian_prod(text_t_coords, text_h_coords, text_w_coords, text_l_coords)
+        text_ids = text_ids.unsqueeze(0).expand(batch_size, -1, -1).to(torch_device)
+
+        timestep = torch.tensor([1.0]).to(torch_device).expand(batch_size)
+        guidance = torch.tensor([1.0]).to(torch_device).expand(batch_size)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "img_ids": image_ids,
+            "txt_ids": text_ids,
+            "timestep": timestep,
+            "guidance": guidance,
+        }
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "patch_size": 1,
+            "in_channels": 4,
+            "num_layers": 1,
+            "num_single_layers": 1,
+            "attention_head_dim": 16,
+            "num_attention_heads": 2,
+            "joint_attention_dim": 32,
+            "timestep_guidance_channels": 256,  # Hardcoded in original code
+            "axes_dims_rope": [4, 4, 4, 4],
+        }
+
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    # TODO (Daniel, Sayak): We can remove this test.
+    def test_flux2_consistency(self, seed=0):
+        torch.manual_seed(seed)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        torch.manual_seed(seed)
+        model = self.model_class(**init_dict)
+        # state_dict = model.state_dict()
+        # for key, param in state_dict.items():
+        #     print(f"{key} | {param.shape}")
+        # torch.save(state_dict, "/raid/daniel_gu/test_flux2_params/diffusers.pt")
+        model.to(torch_device)
+        model.eval()
+
+        with attention_backend("native"):
+            with torch.no_grad():
+                output = model(**inputs_dict)
+
+                if isinstance(output, dict):
+                    output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+
+        # input & output have to have the same shape
+        input_tensor = inputs_dict[self.main_input_name]
+        expected_shape = input_tensor.shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+        # Check against expected slice
+        # fmt: off
+        expected_slice = torch.tensor([-0.3662, 0.4844, 0.6334, -0.3497, 0.2162, 0.0188, 0.0521, -0.2061, -0.2041, -0.0342, -0.7107, 0.4797, -0.3280, 0.7059, -0.0849, 0.4416])
+        # fmt: on
+
+        flat_output = output.cpu().flatten()
+        generated_slice = torch.cat([flat_output[:8], flat_output[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-4))
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"Flux2Transformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class Flux2TransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()
+
+    def prepare_dummy_input(self, height, width):
+        return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)
+
+
+class Flux2TransformerLoRAHotSwapTests(LoraHotSwappingForModelTesterMixin, unittest.TestCase):
+    model_class = Flux2Transformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return Flux2TransformerTests().prepare_init_args_and_inputs_for_common()
+
+    def prepare_dummy_input(self, height, width):
+        return Flux2TransformerTests().prepare_dummy_input(height=height, width=width)
diff --git a/tests/models/transformers/test_models_transformer_hunyuan_1_5.py b/tests/models/transformers/test_models_transformer_hunyuan_1_5.py
new file mode 100644
index 000000000000..57080bc5b0b4
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_hunyuan_1_5.py
@@ -0,0 +1,101 @@
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import HunyuanVideo15Transformer3DModel
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..test_modeling_common import ModelTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanVideo15Transformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = HunyuanVideo15Transformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+    model_split_percents = [0.99, 0.99, 0.99]
+
+    text_embed_dim = 16
+    text_embed_2_dim = 8
+    image_embed_dim = 12
+
+    @property
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 4
+        num_frames = 1
+        height = 8
+        width = 8
+        sequence_length = 6
+        sequence_length_2 = 4
+        image_sequence_length = 3
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        timestep = torch.tensor([1.0]).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, self.text_embed_dim), device=torch_device)
+        encoder_hidden_states_2 = torch.randn(
+            (batch_size, sequence_length_2, self.text_embed_2_dim), device=torch_device
+        )
+        encoder_attention_mask = torch.ones((batch_size, sequence_length), device=torch_device)
+        encoder_attention_mask_2 = torch.ones((batch_size, sequence_length_2), device=torch_device)
+        # All zeros for inducing T2V path in the model.
+        image_embeds = torch.zeros((batch_size, image_sequence_length, self.image_embed_dim), device=torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "encoder_hidden_states_2": encoder_hidden_states_2,
+            "encoder_attention_mask_2": encoder_attention_mask_2,
+            "image_embeds": image_embeds,
+        }
+
+    @property
+    def input_shape(self):
+        return (4, 1, 8, 8)
+
+    @property
+    def output_shape(self):
+        return (4, 1, 8, 8)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 4,
+            "out_channels": 4,
+            "num_attention_heads": 2,
+            "attention_head_dim": 8,
+            "num_layers": 2,
+            "num_refiner_layers": 1,
+            "mlp_ratio": 2.0,
+            "patch_size": 1,
+            "patch_size_t": 1,
+            "text_embed_dim": self.text_embed_dim,
+            "text_embed_2_dim": self.text_embed_2_dim,
+            "image_embed_dim": self.image_embed_dim,
+            "rope_axes_dim": (2, 2, 4),
+            "target_size": 16,
+            "task_type": "t2v",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"HunyuanVideo15Transformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
diff --git a/tests/models/transformers/test_models_transformer_sana_video.py b/tests/models/transformers/test_models_transformer_sana_video.py
new file mode 100644
index 000000000000..ff564ed8918d
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_sana_video.py
@@ -0,0 +1,97 @@
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import SanaVideoTransformer3DModel
+
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+
+
+enable_full_determinism()
+
+
+class SanaVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = SanaVideoTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 16
+        num_frames = 2
+        height = 16
+        width = 16
+        text_encoder_embedding_dim = 16
+        sequence_length = 12
+
+        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
+
+        return {
+            "hidden_states": hidden_states,
+            "encoder_hidden_states": encoder_hidden_states,
+            "timestep": timestep,
+        }
+
+    @property
+    def input_shape(self):
+        return (16, 2, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (16, 2, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 16,
+            "out_channels": 16,
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "num_layers": 2,
+            "num_cross_attention_heads": 2,
+            "cross_attention_head_dim": 12,
+            "cross_attention_dim": 24,
+            "caption_channels": 16,
+            "mlp_ratio": 2.5,
+            "dropout": 0.0,
+            "attention_bias": False,
+            "sample_size": 8,
+            "patch_size": (1, 2, 2),
+            "norm_elementwise_affine": False,
+            "norm_eps": 1e-6,
+            "qk_norm": "rms_norm_across_heads",
+            "rope_max_seq_len": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"SanaVideoTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class SanaVideoTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = SanaVideoTransformer3DModel
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return SanaVideoTransformer3DTests().prepare_init_args_and_inputs_for_common()
diff --git a/tests/models/transformers/test_models_transformer_wan_animate.py b/tests/models/transformers/test_models_transformer_wan_animate.py
new file mode 100644
index 000000000000..5d571b8c2e7d
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_wan_animate.py
@@ -0,0 +1,126 @@
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import WanAnimateTransformer3DModel
+
+from ...testing_utils import (
+    enable_full_determinism,
+    torch_device,
+)
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+
+
+enable_full_determinism()
+
+
+class WanAnimateTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+    model_class = WanAnimateTransformer3DModel
+    main_input_name = "hidden_states"
+    uses_custom_attn_processor = True
+
+    @property
+    def dummy_input(self):
+        batch_size = 1
+        num_channels = 4
+        num_frames = 20  # To make the shapes work out; for complicated reasons we want 21 to divide num_frames + 1
+        height = 16
+        width = 16
+        text_encoder_embedding_dim = 16
+        sequence_length = 12
+
+        clip_seq_len = 12
+        clip_dim = 16
+
+        inference_segment_length = 77  # The inference segment length in the full Wan2.2-Animate-14B model
+        face_height = 16  # Should be square and match `motion_encoder_size` below
+        face_width = 16
+
+        hidden_states = torch.randn((batch_size, 2 * num_channels + 4, num_frames + 1, height, width)).to(torch_device)
+        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
+        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
+        clip_ref_features = torch.randn((batch_size, clip_seq_len, clip_dim)).to(torch_device)
+        pose_latents = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
+        face_pixel_values = torch.randn((batch_size, 3, inference_segment_length, face_height, face_width)).to(
+            torch_device
+        )
+
+        return {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_hidden_states_image": clip_ref_features,
+            "pose_hidden_states": pose_latents,
+            "face_pixel_values": face_pixel_values,
+        }
+
+    @property
+    def input_shape(self):
+        return (12, 1, 16, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 1, 16, 16)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        # Use custom channel sizes since the default Wan Animate channel sizes will cause the motion encoder to
+        # contain the vast majority of the parameters in the test model
+        channel_sizes = {"4": 16, "8": 16, "16": 16}
+
+        init_dict = {
+            "patch_size": (1, 2, 2),
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "in_channels": 12,  # 2 * C + 4 = 2 * 4 + 4 = 12
+            "latent_channels": 4,
+            "out_channels": 4,
+            "text_dim": 16,
+            "freq_dim": 256,
+            "ffn_dim": 32,
+            "num_layers": 2,
+            "cross_attn_norm": True,
+            "qk_norm": "rms_norm_across_heads",
+            "image_dim": 16,
+            "rope_max_seq_len": 32,
+            "motion_encoder_channel_sizes": channel_sizes,  # Start of Wan Animate-specific config
+            "motion_encoder_size": 16,  # Ensures that there will be 2 motion encoder resblocks
+            "motion_style_dim": 8,
+            "motion_dim": 4,
+            "motion_encoder_dim": 16,
+            "face_encoder_hidden_dim": 16,
+            "face_encoder_num_heads": 2,
+            "inject_face_latents_blocks": 2,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"WanAnimateTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    # Override test_output because the transformer output is expected to have less channels than the main transformer
+    # input.
+    def test_output(self):
+        expected_output_shape = (1, 4, 21, 16, 16)
+        super().test_output(expected_output_shape=expected_output_shape)
+
+
+class WanAnimateTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = WanAnimateTransformer3DModel
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return WanAnimateTransformer3DTests().prepare_init_args_and_inputs_for_common()
diff --git a/tests/models/transformers/test_models_transformer_z_image.py b/tests/models/transformers/test_models_transformer_z_image.py
new file mode 100644
index 000000000000..79054019f2d2
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_z_image.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import unittest
+
+import torch
+
+from diffusers import ZImageTransformer2DModel
+
+from ...testing_utils import IS_GITHUB_ACTIONS, torch_device
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+
+
+# Z-Image requires torch.use_deterministic_algorithms(False) due to complex64 RoPE operations
+# Cannot use enable_full_determinism() which sets it to True
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+torch.use_deterministic_algorithms(False)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+if hasattr(torch.backends, "cuda"):
+    torch.backends.cuda.matmul.allow_tf32 = False
+
+
+@unittest.skipIf(
+    IS_GITHUB_ACTIONS,
+    reason="Skipping test-suite inside the CI because the model has `torch.empty()` inside of it during init and we don't have a clear way to override it in the modeling tests.",
+)
+class ZImageTransformerTests(ModelTesterMixin, unittest.TestCase):
+    model_class = ZImageTransformer2DModel
+    main_input_name = "x"
+    # We override the items here because the transformer under consideration is small.
+    model_split_percents = [0.9, 0.9, 0.9]
+
+    def prepare_dummy_input(self, height=16, width=16):
+        batch_size = 1
+        num_channels = 16
+        embedding_dim = 16
+        sequence_length = 16
+
+        hidden_states = [torch.randn((num_channels, 1, height, width)).to(torch_device) for _ in range(batch_size)]
+        encoder_hidden_states = [
+            torch.randn((sequence_length, embedding_dim)).to(torch_device) for _ in range(batch_size)
+        ]
+        timestep = torch.tensor([0.0]).to(torch_device)
+
+        return {"x": hidden_states, "cap_feats": encoder_hidden_states, "t": timestep}
+
+    @property
+    def dummy_input(self):
+        return self.prepare_dummy_input()
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "all_patch_size": (2,),
+            "all_f_patch_size": (1,),
+            "in_channels": 16,
+            "dim": 16,
+            "n_layers": 1,
+            "n_refiner_layers": 1,
+            "n_heads": 1,
+            "n_kv_heads": 2,
+            "qk_norm": True,
+            "cap_feat_dim": 16,
+            "rope_theta": 256.0,
+            "t_scale": 1000.0,
+            "axes_dims": [8, 4, 4],
+            "axes_lens": [256, 32, 32],
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def setUp(self):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"ZImageTransformer2DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @unittest.skip("Test is not supported for handling main inputs that are lists.")
+    def test_training(self):
+        super().test_training()
+
+    @unittest.skip("Test is not supported for handling main inputs that are lists.")
+    def test_ema_training(self):
+        super().test_ema_training()
+
+    @unittest.skip("Test is not supported for handling main inputs that are lists.")
+    def test_effective_gradient_checkpointing(self):
+        super().test_effective_gradient_checkpointing()
+
+    @unittest.skip(
+        "Test needs to be revisited. But we need to ensure `x_pad_token` and `cap_pad_token` are cast to the same dtype as the destination tensor before they are assigned to the padding indices."
+    )
+    def test_layerwise_casting_training(self):
+        super().test_layerwise_casting_training()
+
+    @unittest.skip("Test is not supported for handling main inputs that are lists.")
+    def test_outputs_equivalence(self):
+        super().test_outputs_equivalence()
+
+    @unittest.skip("Test will pass if we change to deterministic values instead of empty in the DiT.")
+    def test_group_offloading(self):
+        super().test_group_offloading()
+
+    @unittest.skip("Test will pass if we change to deterministic values instead of empty in the DiT.")
+    def test_group_offloading_with_disk(self):
+        super().test_group_offloading_with_disk()
+
+
+class ZImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
+    model_class = ZImageTransformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]
+
+    def prepare_init_args_and_inputs_for_common(self):
+        return ZImageTransformerTests().prepare_init_args_and_inputs_for_common()
+
+    def prepare_dummy_input(self, height, width):
+        return ZImageTransformerTests().prepare_dummy_input(height=height, width=width)
+
+    @unittest.skip(
+        "The repeated block in this model is ZImageTransformerBlock, which is used for noise_refiner, context_refiner, and layers. As a consequence of this, the inputs recorded for the block would vary during compilation and full compilation with fullgraph=True would trigger recompilation at least thrice."
+    )
+    def test_torch_compile_recompilation_and_graph_break(self):
+        super().test_torch_compile_recompilation_and_graph_break()
+
+    @unittest.skip("Fullgraph AoT is broken")
+    def test_compile_works_with_aot(self):
+        super().test_compile_works_with_aot()
+
+    @unittest.skip("Fullgraph is broken")
+    def test_compile_on_different_shapes(self):
+        super().test_compile_on_different_shapes()
diff --git a/tests/modular_pipelines/flux/__init__.py b/tests/modular_pipelines/flux/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
new file mode 100644
index 000000000000..854b5218c617
--- /dev/null
+++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
@@ -0,0 +1,181 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import tempfile
+
+import numpy as np
+import PIL
+import torch
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.modular_pipelines import (
+    FluxAutoBlocks,
+    FluxKontextAutoBlocks,
+    FluxKontextModularPipeline,
+    FluxModularPipeline,
+    ModularPipeline,
+)
+
+from ...testing_utils import floats_tensor, torch_device
+from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+
+
+class TestFluxModularPipelineFast(ModularPipelineTesterMixin):
+    pipeline_class = FluxModularPipeline
+    pipeline_blocks_class = FluxAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-modular"
+
+    params = frozenset(["prompt", "height", "width", "guidance_scale"])
+    batch_params = frozenset(["prompt"])
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_float16_inference(self):
+        super().test_float16_inference(9e-2)
+
+
+class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
+    pipeline_class = FluxModularPipeline
+    pipeline_blocks_class = FluxAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-modular"
+
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
+    batch_params = frozenset(["prompt", "image"])
+
+    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
+        pipeline = super().get_pipeline(components_manager, torch_dtype)
+
+        # Override `vae_scale_factor` here as currently, `image_processor` is initialized with
+        # fixed constants instead of
+        # https://github.com/huggingface/diffusers/blob/d54622c2679d700b425ad61abce9b80fc36212c0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L230C9-L232C10
+        pipeline.image_processor = VaeImageProcessor(vae_scale_factor=2)
+        return pipeline
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 4,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "output_type": "pt",
+        }
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(torch_device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = PIL.Image.fromarray(np.uint8(image)).convert("RGB")
+
+        inputs["image"] = init_image
+        inputs["strength"] = 0.5
+
+        return inputs
+
+    def test_save_from_pretrained(self):
+        pipes = []
+        base_pipe = self.get_pipeline().to(torch_device)
+        pipes.append(base_pipe)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            base_pipe.save_pretrained(tmpdirname)
+
+            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
+            pipe.load_components(torch_dtype=torch.float32)
+            pipe.to(torch_device)
+            pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)
+
+        pipes.append(pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs()
+            image = pipe(**inputs, output="images")
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+
+    def test_float16_inference(self):
+        super().test_float16_inference(8e-2)
+
+
+class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
+    pipeline_class = FluxKontextModularPipeline
+    pipeline_blocks_class = FluxKontextAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-kontext-pipe"
+
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
+    batch_params = frozenset(["prompt", "image"])
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 48,
+            "output_type": "pt",
+        }
+        image = PIL.Image.new("RGB", (32, 32), 0)
+
+        inputs["image"] = image
+        inputs["max_area"] = inputs["height"] * inputs["width"]
+        inputs["_auto_resize"] = False
+
+        return inputs
+
+    def test_save_from_pretrained(self):
+        pipes = []
+        base_pipe = self.get_pipeline().to(torch_device)
+        pipes.append(base_pipe)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            base_pipe.save_pretrained(tmpdirname)
+
+            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
+            pipe.load_components(torch_dtype=torch.float32)
+            pipe.to(torch_device)
+            pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)
+
+        pipes.append(pipe)
+
+        image_slices = []
+        for pipe in pipes:
+            inputs = self.get_dummy_inputs()
+            image = pipe(**inputs, output="images")
+
+            image_slices.append(image[0, -3:, -3:, -1].flatten())
+
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+
+    def test_float16_inference(self):
+        super().test_float16_inference(9e-2)
diff --git a/tests/modular_pipelines/qwen/__init__.py b/tests/modular_pipelines/qwen/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
new file mode 100644
index 000000000000..8d7600781b24
--- /dev/null
+++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
@@ -0,0 +1,120 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import PIL
+import pytest
+
+from diffusers.modular_pipelines import (
+    QwenImageAutoBlocks,
+    QwenImageEditAutoBlocks,
+    QwenImageEditModularPipeline,
+    QwenImageEditPlusAutoBlocks,
+    QwenImageEditPlusModularPipeline,
+    QwenImageModularPipeline,
+)
+
+from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin
+
+
+class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
+    pipeline_class = QwenImageModularPipeline
+    pipeline_blocks_class = QwenImageAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-qwenimage-modular"
+
+    params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
+    batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+
+    def get_dummy_inputs(self):
+        generator = self.get_generator()
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "bad quality",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-4)
+
+
+class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
+    pipeline_class = QwenImageEditModularPipeline
+    pipeline_blocks_class = QwenImageEditAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-qwenimage-edit-modular"
+
+    params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
+    batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+
+    def get_dummy_inputs(self):
+        generator = self.get_generator()
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "bad quality",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 32,
+            "width": 32,
+            "output_type": "pt",
+        }
+        inputs["image"] = PIL.Image.new("RGB", (32, 32), 0)
+        return inputs
+
+    def test_guider_cfg(self):
+        super().test_guider_cfg(7e-5)
+
+
+class TestQwenImageEditPlusModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
+    pipeline_class = QwenImageEditPlusModularPipeline
+    pipeline_blocks_class = QwenImageEditPlusAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-qwenimage-edit-plus-modular"
+
+    # No `mask_image` yet.
+    params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image"])
+    batch_params = frozenset(["prompt", "negative_prompt", "image"])
+
+    def get_dummy_inputs(self):
+        generator = self.get_generator()
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "bad quality",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 32,
+            "width": 32,
+            "output_type": "pt",
+        }
+        inputs["image"] = PIL.Image.new("RGB", (32, 32), 0)
+        return inputs
+
+    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
+    def test_num_images_per_prompt(self):
+        super().test_num_images_per_prompt()
+
+    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
+    def test_inference_batch_consistent():
+        super().test_inference_batch_consistent()
+
+    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
+    def test_inference_batch_single_identical():
+        super().test_inference_batch_single_identical()
+
+    def test_guider_cfg(self):
+        super().test_guider_cfg(1e-3)
diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
index d05f818135ab..7b55933e4caf 100644
--- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
+++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
@@ -14,93 +14,41 @@
 # limitations under the License.
 
 import random
-import unittest
 from typing import Any, Dict
 
 import numpy as np
 import torch
 from PIL import Image
 
-from diffusers import (
-    ClassifierFreeGuidance,
-    StableDiffusionXLAutoBlocks,
-    StableDiffusionXLModularPipeline,
-)
+from diffusers import ClassifierFreeGuidance, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
 from diffusers.loaders import ModularIPAdapterMixin
 
-from ...models.unets.test_models_unet_2d_condition import (
-    create_ip_adapter_state_dict,
-)
-from ...testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    torch_device,
-)
-from ..test_modular_pipelines_common import (
-    ModularPipelineTesterMixin,
-)
+from ...models.unets.test_models_unet_2d_condition import create_ip_adapter_state_dict
+from ...testing_utils import enable_full_determinism, floats_tensor, torch_device
+from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin
 
 
 enable_full_determinism()
 
 
-class SDXLModularTests:
+class SDXLModularTesterMixin:
     """
     This mixin defines method to create pipeline, base input and base test across all SDXL modular tests.
     """
 
-    pipeline_class = StableDiffusionXLModularPipeline
-    pipeline_blocks_class = StableDiffusionXLAutoBlocks
-    repo = "hf-internal-testing/tiny-sdxl-modular"
-    params = frozenset(
-        [
-            "prompt",
-            "height",
-            "width",
-            "negative_prompt",
-            "cross_attention_kwargs",
-            "image",
-            "mask_image",
-        ]
-    )
-    batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
-
-    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
-        pipeline = self.pipeline_blocks_class().init_pipeline(self.repo, components_manager=components_manager)
-        pipeline.load_components(torch_dtype=torch_dtype)
-        return pipeline
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-        return inputs
-
     def _test_stable_diffusion_xl_euler(self, expected_image_shape, expected_slice, expected_max_diff=1e-2):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        sd_pipe = self.get_pipeline()
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
+        sd_pipe = self.get_pipeline().to(torch_device)
 
-        inputs = self.get_dummy_inputs(device)
+        inputs = self.get_dummy_inputs()
         image = sd_pipe(**inputs, output="images")
-        image_slice = image[0, -3:, -3:, -1]
+        image_slice = image[0, -3:, -3:, -1].cpu()
 
         assert image.shape == expected_image_shape
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < expected_max_diff, (
-            "Image Slice does not match expected slice"
-        )
+        max_diff = torch.abs(image_slice.flatten() - expected_slice).max()
+        assert max_diff < expected_max_diff, f"Image slice does not match expected slice. Max Difference: {max_diff}"
 
 
-class SDXLModularIPAdapterTests:
+class SDXLModularIPAdapterTesterMixin:
     """
     This mixin is designed to test IP Adapter.
     """
@@ -139,7 +87,7 @@ def _modify_inputs_for_ip_adapter_test(self, inputs: Dict[str, Any]):
         if "image" in parameters and "strength" in parameters:
             inputs["num_inference_steps"] = 4
 
-        inputs["output_type"] = "np"
+        inputs["output_type"] = "pt"
         return inputs
 
     def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=None):
@@ -157,14 +105,14 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
 
         blocks = self.pipeline_blocks_class()
         _ = blocks.sub_blocks.pop("ip_adapter")
-        pipe = blocks.init_pipeline(self.repo)
+        pipe = blocks.init_pipeline(self.pretrained_model_name_or_path)
         pipe.load_components(torch_dtype=torch.float32)
         pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
+
         cross_attention_dim = pipe.unet.config.get("cross_attention_dim")
 
         # forward pass without ip adapter
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         if expected_pipe_slice is None:
             output_without_adapter = pipe(**inputs, output="images")
         else:
@@ -175,7 +123,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         pipe.unet._load_ip_adapter_weights(adapter_state_dict)
 
         # forward pass with single ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         pipe.set_ip_adapter_scale(0.0)
@@ -184,7 +132,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
             output_without_adapter_scale = output_without_adapter_scale[0, -3:, -3:, -1].flatten()
 
         # forward pass with single ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         pipe.set_ip_adapter_scale(42.0)
@@ -192,8 +140,8 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         if expected_pipe_slice is not None:
             output_with_adapter_scale = output_with_adapter_scale[0, -3:, -3:, -1].flatten()
 
-        max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
-        max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()
+        max_diff_without_adapter_scale = torch.abs(output_without_adapter_scale - output_without_adapter).max()
+        max_diff_with_adapter_scale = torch.abs(output_with_adapter_scale - output_without_adapter).max()
 
         assert max_diff_without_adapter_scale < expected_max_diff, (
             "Output without ip-adapter must be same as normal inference"
@@ -206,7 +154,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         pipe.unet._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
 
         # forward pass with multi ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         pipe.set_ip_adapter_scale([0.0, 0.0])
@@ -215,7 +163,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
             output_without_multi_adapter_scale = output_without_multi_adapter_scale[0, -3:, -3:, -1].flatten()
 
         # forward pass with multi ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs())
         inputs["ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         inputs["negative_ip_adapter_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         pipe.set_ip_adapter_scale([42.0, 42.0])
@@ -223,10 +171,10 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         if expected_pipe_slice is not None:
             output_with_multi_adapter_scale = output_with_multi_adapter_scale[0, -3:, -3:, -1].flatten()
 
-        max_diff_without_multi_adapter_scale = np.abs(
+        max_diff_without_multi_adapter_scale = torch.abs(
             output_without_multi_adapter_scale - output_without_adapter
         ).max()
-        max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
+        max_diff_with_multi_adapter_scale = torch.abs(output_with_multi_adapter_scale - output_without_adapter).max()
         assert max_diff_without_multi_adapter_scale < expected_max_diff, (
             "Output without multi-ip-adapter must be same as normal inference"
         )
@@ -235,7 +183,7 @@ def test_ip_adapter(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         )
 
 
-class SDXLModularControlNetTests:
+class SDXLModularControlNetTesterMixin:
     """
     This mixin is designed to test ControlNet.
     """
@@ -269,29 +217,29 @@ def test_controlnet(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         # compare against static slices and that can be shaky (with a VVVV low probability).
         expected_max_diff = 9e-4 if torch_device == "cpu" else expected_max_diff
 
-        pipe = self.get_pipeline()
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
+        pipe = self.get_pipeline().to(torch_device)
 
         # forward pass without controlnet
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         output_without_controlnet = pipe(**inputs, output="images")
         output_without_controlnet = output_without_controlnet[0, -3:, -3:, -1].flatten()
 
         # forward pass with single controlnet, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         inputs["controlnet_conditioning_scale"] = 0.0
         output_without_controlnet_scale = pipe(**inputs, output="images")
         output_without_controlnet_scale = output_without_controlnet_scale[0, -3:, -3:, -1].flatten()
 
         # forward pass with single controlnet, but with scale of adapter weights
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         inputs["controlnet_conditioning_scale"] = 42.0
         output_with_controlnet_scale = pipe(**inputs, output="images")
         output_with_controlnet_scale = output_with_controlnet_scale[0, -3:, -3:, -1].flatten()
 
-        max_diff_without_controlnet_scale = np.abs(output_without_controlnet_scale - output_without_controlnet).max()
-        max_diff_with_controlnet_scale = np.abs(output_with_controlnet_scale - output_without_controlnet).max()
+        max_diff_without_controlnet_scale = torch.abs(
+            output_without_controlnet_scale - output_without_controlnet
+        ).max()
+        max_diff_with_controlnet_scale = torch.abs(output_with_controlnet_scale - output_without_controlnet).max()
 
         assert max_diff_without_controlnet_scale < expected_max_diff, (
             "Output without controlnet must be same as normal inference"
@@ -299,76 +247,66 @@ def test_controlnet(self, expected_max_diff: float = 1e-4, expected_pipe_slice=N
         assert max_diff_with_controlnet_scale > 1e-2, "Output with controlnet must be different from normal inference"
 
     def test_controlnet_cfg(self):
-        pipe = self.get_pipeline()
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
+        pipe = self.get_pipeline().to(torch_device)
 
         # forward pass with CFG not applied
         guider = ClassifierFreeGuidance(guidance_scale=1.0)
         pipe.update_components(guider=guider)
 
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         out_no_cfg = pipe(**inputs, output="images")
 
         # forward pass with CFG applied
         guider = ClassifierFreeGuidance(guidance_scale=7.5)
         pipe.update_components(guider=guider)
-        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs(torch_device))
+        inputs = self._modify_inputs_for_controlnet_test(self.get_dummy_inputs())
         out_cfg = pipe(**inputs, output="images")
 
         assert out_cfg.shape == out_no_cfg.shape
-        max_diff = np.abs(out_cfg - out_no_cfg).max()
+        max_diff = torch.abs(out_cfg - out_no_cfg).max()
         assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
 
 
-class SDXLModularGuiderTests:
-    def test_guider_cfg(self):
-        pipe = self.get_pipeline()
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        # forward pass with CFG not applied
-        guider = ClassifierFreeGuidance(guidance_scale=1.0)
-        pipe.update_components(guider=guider)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        out_no_cfg = pipe(**inputs, output="images")
-
-        # forward pass with CFG applied
-        guider = ClassifierFreeGuidance(guidance_scale=7.5)
-        pipe.update_components(guider=guider)
-        inputs = self.get_dummy_inputs(torch_device)
-        out_cfg = pipe(**inputs, output="images")
-
-        assert out_cfg.shape == out_no_cfg.shape
-        max_diff = np.abs(out_cfg - out_no_cfg).max()
-        assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
-
-
-class SDXLModularPipelineFastTests(
-    SDXLModularTests,
-    SDXLModularIPAdapterTests,
-    SDXLModularControlNetTests,
-    SDXLModularGuiderTests,
+class TestSDXLModularPipelineFast(
+    SDXLModularTesterMixin,
+    SDXLModularIPAdapterTesterMixin,
+    SDXLModularControlNetTesterMixin,
+    ModularGuiderTesterMixin,
     ModularPipelineTesterMixin,
-    unittest.TestCase,
 ):
     """Test cases for Stable Diffusion XL modular pipeline fast tests."""
 
+    pipeline_class = StableDiffusionXLModularPipeline
+    pipeline_blocks_class = StableDiffusionXLAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-sdxl-modular"
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "negative_prompt",
+            "cross_attention_kwargs",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt"])
+    expected_image_output_shape = (1, 3, 64, 64)
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "output_type": "pt",
+        }
+        return inputs
+
     def test_stable_diffusion_xl_euler(self):
         self._test_stable_diffusion_xl_euler(
-            expected_image_shape=(1, 64, 64, 3),
-            expected_slice=[
-                0.5966781,
-                0.62939394,
-                0.48465094,
-                0.51573336,
-                0.57593524,
-                0.47035995,
-                0.53410417,
-                0.51436996,
-                0.47313565,
-            ],
+            expected_image_shape=self.expected_image_output_shape,
+            expected_slice=torch.tensor(
+                [0.3886, 0.4685, 0.4953, 0.4217, 0.4317, 0.3945, 0.4847, 0.4704, 0.4731],
+            ),
             expected_max_diff=1e-2,
         )
 
@@ -376,39 +314,52 @@ def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-3)
 
 
-class SDXLImg2ImgModularPipelineFastTests(
-    SDXLModularTests,
-    SDXLModularIPAdapterTests,
-    SDXLModularControlNetTests,
-    SDXLModularGuiderTests,
+class TestSDXLImg2ImgModularPipelineFast(
+    SDXLModularTesterMixin,
+    SDXLModularIPAdapterTesterMixin,
+    SDXLModularControlNetTesterMixin,
+    ModularGuiderTesterMixin,
     ModularPipelineTesterMixin,
-    unittest.TestCase,
 ):
     """Test cases for Stable Diffusion XL image-to-image modular pipeline fast tests."""
 
-    def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
-        image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device)
-        image = image / 2 + 0.5
-        inputs["image"] = image
-        inputs["strength"] = 0.8
+    pipeline_class = StableDiffusionXLModularPipeline
+    pipeline_blocks_class = StableDiffusionXLAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-sdxl-modular"
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "negative_prompt",
+            "cross_attention_kwargs",
+            "image",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt", "image"])
+    expected_image_output_shape = (1, 3, 64, 64)
+
+    def get_dummy_inputs(self, seed=0):
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 4,
+            "output_type": "pt",
+        }
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(torch_device)
+        image = image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        inputs["image"] = init_image
+        inputs["strength"] = 0.5
 
         return inputs
 
     def test_stable_diffusion_xl_euler(self):
         self._test_stable_diffusion_xl_euler(
-            expected_image_shape=(1, 64, 64, 3),
-            expected_slice=[
-                0.56943184,
-                0.4702148,
-                0.48048905,
-                0.6235963,
-                0.551138,
-                0.49629188,
-                0.60031277,
-                0.5688907,
-                0.43996853,
-            ],
+            expected_image_shape=self.expected_image_output_shape,
+            expected_slice=torch.tensor([0.5246, 0.4466, 0.444, 0.3246, 0.4443, 0.5108, 0.5225, 0.559, 0.5147]),
             expected_max_diff=1e-2,
         )
 
@@ -417,20 +368,43 @@ def test_inference_batch_single_identical(self):
 
 
 class SDXLInpaintingModularPipelineFastTests(
-    SDXLModularTests,
-    SDXLModularIPAdapterTests,
-    SDXLModularControlNetTests,
-    SDXLModularGuiderTests,
+    SDXLModularTesterMixin,
+    SDXLModularIPAdapterTesterMixin,
+    SDXLModularControlNetTesterMixin,
+    ModularGuiderTesterMixin,
     ModularPipelineTesterMixin,
-    unittest.TestCase,
 ):
     """Test cases for Stable Diffusion XL inpainting modular pipeline fast tests."""
 
+    pipeline_class = StableDiffusionXLModularPipeline
+    pipeline_blocks_class = StableDiffusionXLAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-sdxl-modular"
+    params = frozenset(
+        [
+            "prompt",
+            "height",
+            "width",
+            "negative_prompt",
+            "cross_attention_kwargs",
+            "image",
+            "mask_image",
+        ]
+    )
+    batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+    expected_image_output_shape = (1, 3, 64, 64)
+
     def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
+        generator = self.get_generator(seed)
+        inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "generator": generator,
+            "num_inference_steps": 4,
+            "output_type": "pt",
+        }
         image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
         image = image.cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
         # create mask
         image[8:, 8:, :] = 255
         mask_image = Image.fromarray(np.uint8(image)).convert("L").resize((64, 64))
@@ -443,18 +417,21 @@ def get_dummy_inputs(self, device, seed=0):
 
     def test_stable_diffusion_xl_euler(self):
         self._test_stable_diffusion_xl_euler(
-            expected_image_shape=(1, 64, 64, 3),
-            expected_slice=[
-                0.40872607,
-                0.38842705,
-                0.34893104,
-                0.47837183,
-                0.43792963,
-                0.5332134,
-                0.3716843,
-                0.47274873,
-                0.45000193,
-            ],
+            expected_image_shape=self.expected_image_output_shape,
+            expected_slice=torch.tensor(
+                [
+                    0.40872607,
+                    0.38842705,
+                    0.34893104,
+                    0.47837183,
+                    0.43792963,
+                    0.5332134,
+                    0.3716843,
+                    0.47274873,
+                    0.45000193,
+                ],
+                device=torch_device,
+            ),
             expected_max_diff=1e-2,
         )
 
diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py
index d309fcf35339..a33951dac538 100644
--- a/tests/modular_pipelines/test_modular_pipelines_common.py
+++ b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -1,35 +1,20 @@
 import gc
 import tempfile
-import unittest
 from typing import Callable, Union
 
-import numpy as np
+import pytest
 import torch
 
 import diffusers
 from diffusers import ComponentsManager, ModularPipeline, ModularPipelineBlocks
+from diffusers.guiders import ClassifierFreeGuidance
 from diffusers.utils import logging
 
-from ..testing_utils import (
-    backend_empty_cache,
-    numpy_cosine_similarity_distance,
-    require_accelerator,
-    require_torch,
-    torch_device,
-)
+from ..testing_utils import backend_empty_cache, numpy_cosine_similarity_distance, require_accelerator, torch_device
 
 
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-
-    return tensor
-
-
-@require_torch
 class ModularPipelineTesterMixin:
     """
-    This mixin is designed to be used with unittest.TestCase classes.
     It provides a set of common tests for each modular pipeline,
     including:
     - test_pipeline_call_signature: check if the pipeline's __call__ method has all required parameters
@@ -42,24 +27,12 @@ class ModularPipelineTesterMixin:
     # Canonical parameters that are passed to `__call__` regardless
     # of the type of pipeline. They are always optional and have common
     # sense default values.
-    optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "num_images_per_prompt",
-            "latents",
-            "output_type",
-        ]
-    )
+    optional_params = frozenset(["num_inference_steps", "num_images_per_prompt", "latents", "output_type"])
     # this is modular specific: generator needs to be a intermediate input because it's mutable
-    intermediate_params = frozenset(
-        [
-            "generator",
-        ]
-    )
+    intermediate_params = frozenset(["generator"])
 
-    def get_generator(self, seed):
-        device = torch_device if torch_device != "mps" else "cpu"
-        generator = torch.Generator(device).manual_seed(seed)
+    def get_generator(self, seed=0):
+        generator = torch.Generator("cpu").manual_seed(seed)
         return generator
 
     @property
@@ -70,9 +43,9 @@ def pipeline_class(self) -> Union[Callable, ModularPipeline]:
         )
 
     @property
-    def repo(self) -> str:
+    def pretrained_model_name_or_path(self) -> str:
         raise NotImplementedError(
-            "You need to set the attribute `repo` in the child test class. See existing pipeline tests for reference."
+            "You need to set the attribute `pretrained_model_name_or_path` in the child test class. See existing pipeline tests for reference."
         )
 
     @property
@@ -82,13 +55,7 @@ def pipeline_blocks_class(self) -> Union[Callable, ModularPipelineBlocks]:
             "See existing pipeline tests for reference."
         )
 
-    def get_pipeline(self):
-        raise NotImplementedError(
-            "You need to implement `get_pipeline(self)` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    def get_dummy_inputs(self, device, seed=0):
+    def get_dummy_inputs(self, seed=0):
         raise NotImplementedError(
             "You need to implement `get_dummy_inputs(self, device, seed)` in the child test class. "
             "See existing pipeline tests for reference."
@@ -123,20 +90,26 @@ def batch_params(self) -> frozenset:
             "See existing pipeline tests for reference."
         )
 
-    def setUp(self):
+    def setup_method(self):
         # clean up the VRAM before each test
-        super().setUp()
         torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
 
-    def tearDown(self):
+    def teardown_method(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
-        super().tearDown()
         torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
 
+    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
+        pipeline = self.pipeline_blocks_class().init_pipeline(
+            self.pretrained_model_name_or_path, components_manager=components_manager
+        )
+        pipeline.load_components(torch_dtype=torch_dtype)
+        pipeline.set_progress_bar_config(disable=None)
+        return pipeline
+
     def test_pipeline_call_signature(self):
         pipe = self.get_pipeline()
         input_parameters = pipe.blocks.input_names
@@ -152,11 +125,9 @@ def _check_for_parameters(parameters, expected_parameters, param_type):
         _check_for_parameters(self.optional_params, optional_parameters, "optional")
 
     def test_inference_batch_consistent(self, batch_sizes=[2], batch_generator=True):
-        pipe = self.get_pipeline()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
+        pipe = self.get_pipeline().to(torch_device)
 
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         inputs["generator"] = self.get_generator(0)
 
         logger = logging.get_logger(pipe.__module__)
@@ -193,10 +164,9 @@ def test_inference_batch_single_identical(
         batch_size=2,
         expected_max_diff=1e-4,
     ):
-        pipe = self.get_pipeline()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
+        pipe = self.get_pipeline().to(torch_device)
+
+        inputs = self.get_dummy_inputs()
 
         # Reset generator in case it is has been used in self.get_dummy_inputs
         inputs["generator"] = self.get_generator(0)
@@ -226,45 +196,39 @@ def test_inference_batch_single_identical(
 
         assert output_batch.shape[0] == batch_size
 
-        max_diff = np.abs(to_np(output_batch[0]) - to_np(output[0])).max()
+        max_diff = torch.abs(output_batch[0] - output[0]).max()
         assert max_diff < expected_max_diff, "Batch inference results different from single inference results"
 
-    @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
     @require_accelerator
     def test_float16_inference(self, expected_max_diff=5e-2):
         pipe = self.get_pipeline()
         pipe.to(torch_device, torch.float32)
-        pipe.set_progress_bar_config(disable=None)
 
         pipe_fp16 = self.get_pipeline()
         pipe_fp16.to(torch_device, torch.float16)
-        pipe_fp16.set_progress_bar_config(disable=None)
 
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs()
         # Reset generator in case it is used inside dummy inputs
         if "generator" in inputs:
             inputs["generator"] = self.get_generator(0)
         output = pipe(**inputs, output="images")
 
-        fp16_inputs = self.get_dummy_inputs(torch_device)
+        fp16_inputs = self.get_dummy_inputs()
         # Reset generator in case it is used inside dummy inputs
         if "generator" in fp16_inputs:
             fp16_inputs["generator"] = self.get_generator(0)
         output_fp16 = pipe_fp16(**fp16_inputs, output="images")
 
-        if isinstance(output, torch.Tensor):
-            output = output.cpu()
-            output_fp16 = output_fp16.cpu()
+        output = output.cpu()
+        output_fp16 = output_fp16.cpu()
 
         max_diff = numpy_cosine_similarity_distance(output.flatten(), output_fp16.flatten())
         assert max_diff < expected_max_diff, "FP16 inference is different from FP32 inference"
 
     @require_accelerator
     def test_to_device(self):
-        pipe = self.get_pipeline()
-        pipe.set_progress_bar_config(disable=None)
+        pipe = self.get_pipeline().to("cpu")
 
-        pipe.to("cpu")
         model_devices = [
             component.device.type for component in pipe.components.values() if hasattr(component, "device")
         ]
@@ -279,37 +243,30 @@ def test_to_device(self):
         )
 
     def test_inference_is_not_nan_cpu(self):
-        pipe = self.get_pipeline()
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to("cpu")
+        pipe = self.get_pipeline().to("cpu")
 
-        output = pipe(**self.get_dummy_inputs("cpu"), output="images")
-        assert np.isnan(to_np(output)).sum() == 0, "CPU Inference returns NaN"
+        output = pipe(**self.get_dummy_inputs(), output="images")
+        assert torch.isnan(output).sum() == 0, "CPU Inference returns NaN"
 
     @require_accelerator
     def test_inference_is_not_nan(self):
-        pipe = self.get_pipeline()
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
+        pipe = self.get_pipeline().to(torch_device)
 
-        output = pipe(**self.get_dummy_inputs(torch_device), output="images")
-        assert np.isnan(to_np(output)).sum() == 0, "Accelerator Inference returns NaN"
+        output = pipe(**self.get_dummy_inputs(), output="images")
+        assert torch.isnan(output).sum() == 0, "Accelerator Inference returns NaN"
 
     def test_num_images_per_prompt(self):
-        pipe = self.get_pipeline()
+        pipe = self.get_pipeline().to(torch_device)
 
         if "num_images_per_prompt" not in pipe.blocks.input_names:
-            return
-
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
+            pytest.mark.skip("Skipping test as `num_images_per_prompt` is not present in input names.")
 
         batch_sizes = [1, 2]
         num_images_per_prompts = [1, 2]
 
         for batch_size in batch_sizes:
             for num_images_per_prompt in num_images_per_prompts:
-                inputs = self.get_dummy_inputs(torch_device)
+                inputs = self.get_dummy_inputs()
 
                 for key in inputs.keys():
                     if key in self.batch_params:
@@ -329,12 +286,12 @@ def test_components_auto_cpu_offload_inference_consistent(self):
 
         image_slices = []
         for pipe in [base_pipe, offload_pipe]:
-            inputs = self.get_dummy_inputs(torch_device)
+            inputs = self.get_dummy_inputs()
             image = pipe(**inputs, output="images")
 
             image_slices.append(image[0, -3:, -3:, -1].flatten())
 
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
 
     def test_save_from_pretrained(self):
         pipes = []
@@ -351,9 +308,31 @@ def test_save_from_pretrained(self):
 
         image_slices = []
         for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
+            inputs = self.get_dummy_inputs()
             image = pipe(**inputs, output="images")
 
             image_slices.append(image[0, -3:, -3:, -1].flatten())
 
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+
+
+class ModularGuiderTesterMixin:
+    def test_guider_cfg(self, expected_max_diff=1e-2):
+        pipe = self.get_pipeline().to(torch_device)
+
+        # forward pass with CFG not applied
+        guider = ClassifierFreeGuidance(guidance_scale=1.0)
+        pipe.update_components(guider=guider)
+
+        inputs = self.get_dummy_inputs()
+        out_no_cfg = pipe(**inputs, output="images")
+
+        # forward pass with CFG applied
+        guider = ClassifierFreeGuidance(guidance_scale=7.5)
+        pipe.update_components(guider=guider)
+        inputs = self.get_dummy_inputs()
+        out_cfg = pipe(**inputs, output="images")
+
+        assert out_cfg.shape == out_no_cfg.shape
+        max_diff = torch.abs(out_cfg - out_no_cfg).max()
+        assert max_diff > expected_max_diff, "Output with CFG must be different from normal inference"
diff --git a/tests/others/test_attention_backends.py b/tests/others/test_attention_backends.py
index 2e5a2fc82bb6..01f4521c5adc 100644
--- a/tests/others/test_attention_backends.py
+++ b/tests/others/test_attention_backends.py
@@ -7,7 +7,6 @@
 
 ```bash
 export RUN_ATTENTION_BACKEND_TESTS=yes
-export DIFFUSERS_ENABLE_HUB_KERNELS=yes
 
 pytest tests/others/test_attention_backends.py
 ```
@@ -35,7 +34,10 @@
 
 # fmt: off
 FORWARD_CASES = [
-    ("flash_hub", None),
+    (
+        "flash_hub",
+        torch.tensor([0.0820, 0.0859, 0.0918, 0.1016, 0.0957, 0.0996, 0.0996, 0.1016, 0.2188, 0.2266, 0.2363, 0.2500, 0.2539, 0.2461, 0.2422, 0.2695], dtype=torch.bfloat16)
+    ),
     (
         "_flash_3_hub",
         torch.tensor([0.0820, 0.0859, 0.0938, 0.1016, 0.0977, 0.0996, 0.1016, 0.1016, 0.2188, 0.2246, 0.2344, 0.2480, 0.2539, 0.2480, 0.2441, 0.2715], dtype=torch.bfloat16),
@@ -55,7 +57,11 @@
 ]
 
 COMPILE_CASES = [
-    ("flash_hub", None, True),
+    (
+        "flash_hub",
+        torch.tensor([0.0410, 0.0410, 0.0449, 0.0508, 0.0488, 0.0586, 0.0605, 0.0586, 0.2324, 0.2422, 0.2539, 0.2734, 0.2832, 0.2812, 0.2773, 0.3047], dtype=torch.bfloat16),
+        True
+    ),
     (
         "_flash_3_hub",
         torch.tensor([0.0410, 0.0410, 0.0449, 0.0508, 0.0508, 0.0605, 0.0625, 0.0605, 0.2344, 0.2461, 0.2578, 0.2734, 0.2852, 0.2812, 0.2773, 0.3047], dtype=torch.bfloat16),
diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 14ff1272a29e..5ccba1dabbfe 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -21,11 +21,9 @@
 import pytest
 import torch
 from transformers import (
-    ClapAudioConfig,
     ClapConfig,
     ClapFeatureExtractor,
     ClapModel,
-    ClapTextConfig,
     GPT2Config,
     GPT2LMHeadModel,
     RobertaTokenizer,
@@ -111,33 +109,33 @@ def get_dummy_components(self):
             latent_channels=4,
         )
         torch.manual_seed(0)
-        text_branch_config = ClapTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=1,
-            num_hidden_layers=1,
-            pad_token_id=1,
-            vocab_size=1000,
-            projection_dim=8,
-        )
-        audio_branch_config = ClapAudioConfig(
-            spec_size=8,
-            window_size=4,
-            num_mel_bins=8,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            depths=[1, 1],
-            num_attention_heads=[1, 1],
-            num_hidden_layers=1,
-            hidden_size=192,
-            projection_dim=8,
-            patch_size=2,
-            patch_stride=2,
-            patch_embed_input_channels=4,
-        )
+        text_branch_config = {
+            "bos_token_id": 0,
+            "eos_token_id": 2,
+            "hidden_size": 8,
+            "intermediate_size": 37,
+            "layer_norm_eps": 1e-05,
+            "num_attention_heads": 1,
+            "num_hidden_layers": 1,
+            "pad_token_id": 1,
+            "vocab_size": 1000,
+            "projection_dim": 8,
+        }
+        audio_branch_config = {
+            "spec_size": 8,
+            "window_size": 4,
+            "num_mel_bins": 8,
+            "intermediate_size": 37,
+            "layer_norm_eps": 1e-05,
+            "depths": [1, 1],
+            "num_attention_heads": [1, 1],
+            "num_hidden_layers": 1,
+            "hidden_size": 192,
+            "projection_dim": 8,
+            "patch_size": 2,
+            "patch_stride": 2,
+            "patch_embed_input_channels": 4,
+        }
         text_encoder_config = ClapConfig(
             text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=16
         )
diff --git a/tests/pipelines/chronoedit/__init__.py b/tests/pipelines/chronoedit/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/chronoedit/test_chronoedit.py b/tests/pipelines/chronoedit/test_chronoedit.py
new file mode 100644
index 000000000000..43e5b3159b1c
--- /dev/null
+++ b/tests/pipelines/chronoedit/test_chronoedit.py
@@ -0,0 +1,176 @@
+# Copyright 2025 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from PIL import Image
+from transformers import (
+    AutoTokenizer,
+    CLIPImageProcessor,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+    T5EncoderModel,
+)
+
+from diffusers import (
+    AutoencoderKLWan,
+    ChronoEditPipeline,
+    ChronoEditTransformer3DModel,
+    FlowMatchEulerDiscreteScheduler,
+)
+
+from ...testing_utils import enable_full_determinism
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class ChronoEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = ChronoEditPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs", "height", "width"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        # TODO: impl FlowDPMSolverMultistepScheduler
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        transformer = ChronoEditTransformer3DModel(
+            patch_size=(1, 2, 2),
+            num_attention_heads=2,
+            attention_head_dim=12,
+            in_channels=36,
+            out_channels=16,
+            text_dim=32,
+            freq_dim=256,
+            ffn_dim=32,
+            num_layers=2,
+            cross_attn_norm=True,
+            qk_norm="rms_norm_across_heads",
+            rope_max_seq_len=32,
+            image_dim=4,
+        )
+
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=4,
+            projection_dim=4,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            image_size=32,
+            intermediate_size=16,
+            patch_size=1,
+        )
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        torch.manual_seed(0)
+        image_processor = CLIPImageProcessor(crop_size=32, size=32)
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "image_encoder": image_encoder,
+            "image_processor": image_processor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        image_height = 16
+        image_width = 16
+        image = Image.new("RGB", (image_width, image_height))
+        inputs = {
+            "image": image,
+            "prompt": "dance monkey",
+            "negative_prompt": "negative",  # TODO
+            "height": image_height,
+            "width": image_width,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "num_frames": 5,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+        self.assertEqual(generated_video.shape, (5, 3, 16, 16))
+
+        # fmt: off
+        expected_slice = torch.tensor([0.4525, 0.4520, 0.4485, 0.4534, 0.4523, 0.4522, 0.4529, 0.4528, 0.5022, 0.5064, 0.5011, 0.5061, 0.5028, 0.4979, 0.5117, 0.5192])
+        # fmt: on
+
+        generated_slice = generated_video.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
+
+    @unittest.skip("Test not supported")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skip("TODO: revisit failing as it requires a very high threshold to pass")
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip(
+        "ChronoEditPipeline has to run in mixed precision. Save/Load the entire pipeline in FP16 will result in errors"
+    )
+    def test_save_load_float16(self):
+        pass
diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py
index 1ddbd4ba3df8..74499bfa607a 100644
--- a/tests/pipelines/flux/test_pipeline_flux.py
+++ b/tests/pipelines/flux/test_pipeline_flux.py
@@ -29,6 +29,7 @@
     FluxIPAdapterTesterMixin,
     PipelineTesterMixin,
     PyramidAttentionBroadcastTesterMixin,
+    TaylorSeerCacheTesterMixin,
     check_qkv_fused_layers_exist,
 )
 
@@ -39,6 +40,7 @@ class FluxPipelineFastTests(
     PyramidAttentionBroadcastTesterMixin,
     FasterCacheTesterMixin,
     FirstBlockCacheTesterMixin,
+    TaylorSeerCacheTesterMixin,
     unittest.TestCase,
 ):
     pipeline_class = FluxPipeline
diff --git a/tests/pipelines/flux2/__init__.py b/tests/pipelines/flux2/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/flux2/test_pipeline_flux2.py b/tests/pipelines/flux2/test_pipeline_flux2.py
new file mode 100644
index 000000000000..4404dbc51047
--- /dev/null
+++ b/tests/pipelines/flux2/test_pipeline_flux2.py
@@ -0,0 +1,190 @@
+import unittest
+
+import numpy as np
+import torch
+from transformers import AutoProcessor, Mistral3Config, Mistral3ForConditionalGeneration
+
+from diffusers import (
+    AutoencoderKLFlux2,
+    FlowMatchEulerDiscreteScheduler,
+    Flux2Pipeline,
+    Flux2Transformer2DModel,
+)
+
+from ...testing_utils import (
+    torch_device,
+)
+from ..test_pipelines_common import (
+    PipelineTesterMixin,
+    check_qkv_fused_layers_exist,
+)
+
+
+class Flux2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = Flux2Pipeline
+    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds"])
+    batch_params = frozenset(["prompt"])
+
+    test_xformers_attention = False
+    test_layerwise_casting = True
+    test_group_offloading = True
+
+    supports_dduf = False
+
+    def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
+        torch.manual_seed(0)
+        transformer = Flux2Transformer2DModel(
+            patch_size=1,
+            in_channels=4,
+            num_layers=num_layers,
+            num_single_layers=num_single_layers,
+            attention_head_dim=16,
+            num_attention_heads=2,
+            joint_attention_dim=16,
+            timestep_guidance_channels=256,  # Hardcoded in original code
+            axes_dims_rope=[4, 4, 4, 4],
+        )
+
+        config = Mistral3Config(
+            text_config={
+                "model_type": "mistral",
+                "vocab_size": 32000,
+                "hidden_size": 16,
+                "intermediate_size": 37,
+                "max_position_embeddings": 512,
+                "num_attention_heads": 4,
+                "num_hidden_layers": 1,
+                "num_key_value_heads": 2,
+                "rms_norm_eps": 1e-05,
+                "rope_theta": 1000000000.0,
+                "sliding_window": None,
+                "bos_token_id": 2,
+                "eos_token_id": 3,
+                "pad_token_id": 4,
+            },
+            vision_config={
+                "model_type": "pixtral",
+                "hidden_size": 16,
+                "num_hidden_layers": 1,
+                "num_attention_heads": 4,
+                "intermediate_size": 37,
+                "image_size": 30,
+                "patch_size": 6,
+                "num_channels": 3,
+            },
+            bos_token_id=2,
+            eos_token_id=3,
+            pad_token_id=4,
+            model_dtype="mistral3",
+            image_seq_length=4,
+            vision_feature_layer=-1,
+            image_token_index=1,
+        )
+        torch.manual_seed(0)
+        text_encoder = Mistral3ForConditionalGeneration(config)
+        tokenizer = AutoProcessor.from_pretrained(
+            "hf-internal-testing/Mistral-Small-3.1-24B-Instruct-2503-only-processor"
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLFlux2(
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownEncoderBlock2D",),
+            up_block_types=("UpDecoderBlock2D",),
+            block_out_channels=(4,),
+            layers_per_block=1,
+            latent_channels=1,
+            norm_num_groups=1,
+            use_quant_conv=False,
+            use_post_quant_conv=False,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        return {
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "transformer": transformer,
+            "vae": vae,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device="cpu").manual_seed(seed)
+
+        inputs = {
+            "prompt": "a dog is dancing",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 5.0,
+            "height": 8,
+            "width": 8,
+            "max_sequence_length": 8,
+            "output_type": "np",
+            "text_encoder_out_layers": (1,),
+        }
+        return inputs
+
+    def test_fused_qkv_projections(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        original_image_slice = image[0, -3:, -3:, -1]
+
+        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
+        # to the pipeline level.
+        pipe.transformer.fuse_qkv_projections()
+        self.assertTrue(
+            check_qkv_fused_layers_exist(pipe.transformer, ["to_qkv"]),
+            ("Something wrong with the fused attention layers. Expected all the attention projections to be fused."),
+        )
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_fused = image[0, -3:, -3:, -1]
+
+        pipe.transformer.unfuse_qkv_projections()
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        image_slice_disabled = image[0, -3:, -3:, -1]
+
+        self.assertTrue(
+            np.allclose(original_image_slice, image_slice_fused, atol=1e-3, rtol=1e-3),
+            ("Fusion of QKV projections shouldn't affect the outputs."),
+        )
+        self.assertTrue(
+            np.allclose(image_slice_fused, image_slice_disabled, atol=1e-3, rtol=1e-3),
+            ("Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."),
+        )
+        self.assertTrue(
+            np.allclose(original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2),
+            ("Original outputs should match when fused QKV projections are disabled."),
+        )
+
+    def test_flux_image_output_shape(self):
+        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
+        inputs = self.get_dummy_inputs(torch_device)
+
+        height_width_pairs = [(32, 32), (72, 57)]
+        for height, width in height_width_pairs:
+            expected_height = height - height % (pipe.vae_scale_factor * 2)
+            expected_width = width - width % (pipe.vae_scale_factor * 2)
+
+            inputs.update({"height": height, "width": width})
+            image = pipe(**inputs).images[0]
+            output_height, output_width, _ = image.shape
+            self.assertEqual(
+                (output_height, output_width),
+                (expected_height, expected_width),
+                f"Output shape {image.shape} does not match expected shape {(expected_height, expected_width)}",
+            )
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_video.py b/tests/pipelines/hunyuan_video/test_hunyuan_video.py
index 4bdf3ee20e1b..57a6daebad1f 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_video.py
@@ -33,6 +33,7 @@
     FirstBlockCacheTesterMixin,
     PipelineTesterMixin,
     PyramidAttentionBroadcastTesterMixin,
+    TaylorSeerCacheTesterMixin,
     to_np,
 )
 
@@ -45,6 +46,7 @@ class HunyuanVideoPipelineFastTests(
     PyramidAttentionBroadcastTesterMixin,
     FasterCacheTesterMixin,
     FirstBlockCacheTesterMixin,
+    TaylorSeerCacheTesterMixin,
     unittest.TestCase,
 ):
     pipeline_class = HunyuanVideoPipeline
diff --git a/tests/pipelines/hunyuan_video1_5/__init__.py b/tests/pipelines/hunyuan_video1_5/__init__.py
new file mode 100644
index 000000000000..8fb044d9cf83
--- /dev/null
+++ b/tests/pipelines/hunyuan_video1_5/__init__.py
@@ -0,0 +1 @@
+# Copyright 2025 The HuggingFace Team.
diff --git a/tests/pipelines/hunyuan_video1_5/test_hunyuan_1_5.py b/tests/pipelines/hunyuan_video1_5/test_hunyuan_1_5.py
new file mode 100644
index 000000000000..993c7ef6e4bb
--- /dev/null
+++ b/tests/pipelines/hunyuan_video1_5/test_hunyuan_1_5.py
@@ -0,0 +1,187 @@
+# Copyright 2025 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from transformers import ByT5Tokenizer, Qwen2_5_VLTextConfig, Qwen2_5_VLTextModel, Qwen2Tokenizer, T5EncoderModel
+
+from diffusers import (
+    AutoencoderKLHunyuanVideo15,
+    FlowMatchEulerDiscreteScheduler,
+    HunyuanVideo15Pipeline,
+    HunyuanVideo15Transformer3DModel,
+)
+from diffusers.guiders import ClassifierFreeGuidance
+
+from ...testing_utils import enable_full_determinism
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class HunyuanVideo15PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = HunyuanVideo15Pipeline
+    params = frozenset(
+        [
+            "prompt",
+            "negative_prompt",
+            "height",
+            "width",
+            "prompt_embeds",
+            "prompt_embeds_mask",
+            "negative_prompt_embeds",
+            "negative_prompt_embeds_mask",
+            "prompt_embeds_2",
+            "prompt_embeds_mask_2",
+            "negative_prompt_embeds_2",
+            "negative_prompt_embeds_mask_2",
+        ]
+    )
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"])
+    test_attention_slicing = False
+    test_xformers_attention = False
+    test_layerwise_casting = True
+    test_group_offloading = False
+    supports_dduf = False
+
+    def get_dummy_components(self, num_layers: int = 1):
+        torch.manual_seed(0)
+        transformer = HunyuanVideo15Transformer3DModel(
+            in_channels=9,
+            out_channels=4,
+            num_attention_heads=2,
+            attention_head_dim=8,
+            num_layers=num_layers,
+            num_refiner_layers=1,
+            mlp_ratio=2.0,
+            patch_size=1,
+            patch_size_t=1,
+            text_embed_dim=16,
+            text_embed_2_dim=32,
+            image_embed_dim=12,
+            rope_axes_dim=(2, 2, 4),
+            target_size=16,
+            task_type="t2v",
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKLHunyuanVideo15(
+            in_channels=3,
+            out_channels=3,
+            latent_channels=4,
+            block_out_channels=(16, 16),
+            layers_per_block=1,
+            spatial_compression_ratio=4,
+            temporal_compression_ratio=2,
+            downsample_match_channel=False,
+            upsample_match_channel=False,
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+
+        torch.manual_seed(0)
+        qwen_config = Qwen2_5_VLTextConfig(
+            **{
+                "hidden_size": 16,
+                "intermediate_size": 16,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 2,
+                "num_key_value_heads": 2,
+                "rope_scaling": {
+                    "mrope_section": [1, 1, 2],
+                    "rope_type": "default",
+                    "type": "default",
+                },
+                "rope_theta": 1000000.0,
+            }
+        )
+        text_encoder = Qwen2_5_VLTextModel(qwen_config)
+        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        torch.manual_seed(0)
+        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer_2 = ByT5Tokenizer()
+
+        guider = ClassifierFreeGuidance(guidance_scale=1.0)
+
+        components = {
+            "transformer": transformer.eval(),
+            "vae": vae.eval(),
+            "scheduler": scheduler,
+            "text_encoder": text_encoder.eval(),
+            "text_encoder_2": text_encoder_2.eval(),
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+            "guider": guider,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "monkey",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 16,
+            "width": 16,
+            "num_frames": 9,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        result = pipe(**inputs)
+        video = result.frames
+
+        generated_video = video[0]
+        self.assertEqual(generated_video.shape, (9, 3, 16, 16))
+        generated_slice = generated_video.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+
+        # fmt: off
+        expected_slice = torch.tensor([0.4296, 0.5549, 0.3088, 0.9115, 0.5049, 0.7926, 0.5549, 0.8618, 0.5091, 0.5075, 0.7117, 0.5292, 0.7053, 0.4864, 0.5206, 0.3878])
+        # fmt: on
+
+        self.assertTrue(
+            torch.abs(generated_slice - expected_slice).max() < 1e-3,
+            f"output_slice: {generated_slice}, expected_slice: {expected_slice}",
+        )
+
+    @unittest.skip("TODO: Test not supported for now because needs to be adjusted to work with guiders.")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("Needs to be revisited.")
+    def test_inference_batch_consistent(self):
+        super().test_inference_batch_consistent()
+
+    @unittest.skip("Needs to be revisited.")
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical()
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
index 476fc584cc56..62f5853da9a5 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
@@ -23,7 +23,7 @@
     KandinskyV22InpaintCombinedPipeline,
 )
 
-from ...testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
+from ...testing_utils import enable_full_determinism, require_accelerator, require_torch_accelerator, torch_device
 from ..test_pipelines_common import PipelineTesterMixin
 from .test_kandinsky import Dummies
 from .test_kandinsky_img2img import Dummies as Img2ImgDummies
@@ -402,6 +402,7 @@ def test_save_load_local(self):
     def test_save_load_optional_components(self):
         super().test_save_load_optional_components(expected_max_difference=5e-4)
 
+    @require_accelerator
     def test_sequential_cpu_offload_forward_pass(self):
         super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
 
diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
index d4eb650263af..8a693e9c2dd0 100644
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
@@ -37,6 +37,7 @@
     load_image,
     load_numpy,
     numpy_cosine_similarity_distance,
+    require_accelerator,
     require_torch_accelerator,
     slow,
     torch_device,
@@ -254,6 +255,7 @@ def test_model_cpu_offload_forward_pass(self):
     def test_save_load_optional_components(self):
         super().test_save_load_optional_components(expected_max_difference=5e-4)
 
+    @require_accelerator
     def test_sequential_cpu_offload_forward_pass(self):
         super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
 
diff --git a/tests/pipelines/kandinsky5/test_kandinsky5.py b/tests/pipelines/kandinsky5/test_kandinsky5.py
index 47fccb632a54..4101e7798dea 100644
--- a/tests/pipelines/kandinsky5/test_kandinsky5.py
+++ b/tests/pipelines/kandinsky5/test_kandinsky5.py
@@ -1,4 +1,4 @@
-# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,12 +16,12 @@
 
 import torch
 from transformers import (
+    AutoProcessor,
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
     Qwen2_5_VLConfig,
     Qwen2_5_VLForConditionalGeneration,
-    Qwen2VLProcessor,
 )
 
 from diffusers import (
@@ -33,9 +33,7 @@
 
 from ...testing_utils import (
     enable_full_determinism,
-    torch_device,
 )
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
 
 
@@ -44,51 +42,62 @@
 
 class Kandinsky5T2VPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = Kandinsky5T2VPipeline
-    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs", "prompt_embeds", "negative_prompt_embeds"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
 
-    # Define required optional parameters for your pipeline
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-            "max_sequence_length",
-        ]
-    )
+    batch_params = ["prompt", "negative_prompt"]
 
+    params = frozenset(["prompt", "height", "width", "num_frames", "num_inference_steps", "guidance_scale"])
+
+    required_optional_params = {
+        "num_inference_steps",
+        "generator",
+        "latents",
+        "return_dict",
+        "callback_on_step_end",
+        "callback_on_step_end_tensor_inputs",
+        "max_sequence_length",
+    }
     test_xformers_attention = False
+    supports_optional_components = True
     supports_dduf = False
+    test_attention_slicing = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
         vae = AutoencoderKLHunyuanVideo(
+            act_fn="silu",
+            block_out_channels=[32, 64],
+            down_block_types=[
+                "HunyuanVideoDownBlock3D",
+                "HunyuanVideoDownBlock3D",
+            ],
             in_channels=3,
+            latent_channels=16,
+            layers_per_block=1,
+            mid_block_add_attention=False,
+            norm_num_groups=32,
             out_channels=3,
+            scaling_factor=0.476986,
             spatial_compression_ratio=8,
             temporal_compression_ratio=4,
-            latent_channels=4,
-            block_out_channels=(8, 8, 8, 8),
-            layers_per_block=1,
-            norm_num_groups=4,
+            up_block_types=[
+                "HunyuanVideoUpBlock3D",
+                "HunyuanVideoUpBlock3D",
+            ],
         )
 
-        torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        # Dummy Qwen2.5-VL model
-        config = Qwen2_5_VLConfig(
+        qwen_hidden_size = 32
+        torch.manual_seed(0)
+        qwen_config = Qwen2_5_VLConfig(
             text_config={
-                "hidden_size": 16,
-                "intermediate_size": 16,
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
                 "num_hidden_layers": 2,
                 "num_attention_heads": 2,
                 "num_key_value_heads": 2,
                 "rope_scaling": {
-                    "mrope_section": [1, 1, 2],
+                    "mrope_section": [2, 2, 4],
                     "rope_type": "default",
                     "type": "default",
                 },
@@ -96,211 +105,106 @@ def get_dummy_components(self):
             },
             vision_config={
                 "depth": 2,
-                "hidden_size": 16,
-                "intermediate_size": 16,
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
                 "num_heads": 2,
-                "out_hidden_size": 16,
+                "out_hidden_size": qwen_hidden_size,
             },
-            hidden_size=16,
+            hidden_size=qwen_hidden_size,
             vocab_size=152064,
             vision_end_token_id=151653,
             vision_start_token_id=151652,
             vision_token_id=151654,
         )
-        text_encoder = Qwen2_5_VLForConditionalGeneration(config)
-        tokenizer = Qwen2VLProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
+        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
 
-        # Dummy CLIP model
-        clip_text_encoder_config = CLIPTextConfig(
+        clip_hidden_size = 16
+        torch.manual_seed(0)
+        clip_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
+            hidden_size=clip_hidden_size,
+            intermediate_size=16,
             layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
+            num_attention_heads=2,
+            num_hidden_layers=2,
             pad_token_id=1,
             vocab_size=1000,
-            hidden_act="gelu",
-            projection_dim=32,
+            projection_dim=clip_hidden_size,
         )
-
-        torch.manual_seed(0)
-        text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
+        text_encoder_2 = CLIPTextModel(clip_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         torch.manual_seed(0)
         transformer = Kandinsky5Transformer3DModel(
-            in_visual_dim=4,
-            in_text_dim=16,  # Match tiny Qwen2.5-VL hidden size
-            in_text_dim2=32,  # Match tiny CLIP hidden size
-            time_dim=32,
-            out_visual_dim=4,
+            in_visual_dim=16,
+            in_text_dim=qwen_hidden_size,
+            in_text_dim2=clip_hidden_size,
+            time_dim=16,
+            out_visual_dim=16,
             patch_size=(1, 2, 2),
-            model_dim=48,
-            ff_dim=128,
+            model_dim=16,
+            ff_dim=32,
             num_text_blocks=1,
-            num_visual_blocks=1,
-            axes_dims=(8, 8, 8),
+            num_visual_blocks=2,
+            axes_dims=(1, 1, 2),
             visual_cond=False,
+            attention_type="regular",
         )
 
-        components = {
-            "transformer": transformer.eval(),
-            "vae": vae.eval(),
-            "scheduler": scheduler,
-            "text_encoder": text_encoder.eval(),
+        return {
+            "vae": vae,
+            "text_encoder": text_encoder,
             "tokenizer": tokenizer,
-            "text_encoder_2": text_encoder_2.eval(),
+            "text_encoder_2": text_encoder_2,
             "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "scheduler": scheduler,
         }
-        return components
 
     def get_dummy_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
         else:
             generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A cat dancing",
-            "negative_prompt": "blurry, low quality",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 5.0,
+
+        return {
+            "prompt": "a red square",
             "height": 32,
             "width": 32,
             "num_frames": 5,
-            "max_sequence_length": 16,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "generator": generator,
             "output_type": "pt",
+            "max_sequence_length": 8,
         }
-        return inputs
 
     def test_inference(self):
         device = "cpu"
-
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
         pipe.to(device)
         pipe.set_progress_bar_config(disable=None)
 
         inputs = self.get_dummy_inputs(device)
-        video = pipe(**inputs).frames
-
-        # Check video shape: (batch, frames, channel, height, width)
-        expected_shape = (1, 5, 3, 32, 32)
-        self.assertEqual(video.shape, expected_shape)
+        output = pipe(**inputs)
+        video = output.frames[0]
 
-        # Check specific values
-        expected_slice = torch.tensor(
-            [
-                0.4330,
-                0.4254,
-                0.4285,
-                0.3835,
-                0.4253,
-                0.4196,
-                0.3704,
-                0.3714,
-                0.4999,
-                0.5346,
-                0.4795,
-                0.4637,
-                0.4930,
-                0.5124,
-                0.4902,
-                0.4570,
-            ]
-        )
+        self.assertEqual(video.shape, (3, 3, 16, 16))
 
-        generated_slice = video.flatten()
-        # Take first 8 and last 8 values for comparison
-        video_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
-        self.assertTrue(
-            torch.allclose(video_slice, expected_slice, atol=1e-3),
-            f"video_slice: {video_slice}, expected_slice: {expected_slice}",
-        )
-
-    def test_inference_batch_single_identical(self):
-        # Override to test batch single identical with video
-        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=1e-2)
-
-    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-3, rtol=1e-3):
-        components = self.get_dummy_components()
-
-        text_component_names = ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]
-        text_components = {k: (v if k in text_component_names else None) for k, v in components.items()}
-        non_text_components = {k: (v if k not in text_component_names else None) for k, v in components.items()}
-
-        pipe_with_just_text_encoder = self.pipeline_class(**text_components)
-        pipe_with_just_text_encoder = pipe_with_just_text_encoder.to(torch_device)
-
-        pipe_without_text_encoders = self.pipeline_class(**non_text_components)
-        pipe_without_text_encoders = pipe_without_text_encoders.to(torch_device)
-
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-
-        # Compute `encode_prompt()`.
-
-        # Test single prompt
-        prompt = "A cat dancing"
-        with torch.no_grad():
-            prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = pipe_with_just_text_encoder.encode_prompt(
-                prompt, device=torch_device, max_sequence_length=16
-            )
-
-        # Check shapes
-        self.assertEqual(prompt_embeds_qwen.shape, (1, 4, 16))  # [batch, seq_len, embed_dim]
-        self.assertEqual(prompt_embeds_clip.shape, (1, 32))  # [batch, embed_dim]
-        self.assertEqual(prompt_cu_seqlens.shape, (2,))  # [batch + 1]
-
-        # Test batch of prompts
-        prompts = ["A cat dancing", "A dog running"]
-        with torch.no_grad():
-            batch_embeds_qwen, batch_embeds_clip, batch_cu_seqlens = pipe_with_just_text_encoder.encode_prompt(
-                prompts, device=torch_device, max_sequence_length=16
-            )
-
-        # Check batch size
-        self.assertEqual(batch_embeds_qwen.shape, (len(prompts), 4, 16))
-        self.assertEqual(batch_embeds_clip.shape, (len(prompts), 32))
-        self.assertEqual(len(batch_cu_seqlens), len(prompts) + 1)  # [0, len1, len1+len2]
-
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["guidance_scale"] = 1.0
-
-        # baseline output: full pipeline
-        pipe_out = pipe(**inputs).frames
-
-        # test against pipeline call with pre-computed prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["guidance_scale"] = 1.0
-
-        with torch.no_grad():
-            prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = pipe_with_just_text_encoder.encode_prompt(
-                inputs["prompt"], device=torch_device, max_sequence_length=inputs["max_sequence_length"]
-            )
-
-        inputs["prompt"] = None
-        inputs["prompt_embeds_qwen"] = prompt_embeds_qwen
-        inputs["prompt_embeds_clip"] = prompt_embeds_clip
-        inputs["prompt_cu_seqlens"] = prompt_cu_seqlens
-
-        pipe_out_2 = pipe_without_text_encoders(**inputs)[0]
-
-        self.assertTrue(
-            torch.allclose(pipe_out, pipe_out_2, atol=atol, rtol=rtol),
-            f"max diff: {torch.max(torch.abs(pipe_out - pipe_out_2))}",
-        )
-
-    @unittest.skip("Kandinsky5T2VPipeline does not support attention slicing")
     def test_attention_slicing_forward_pass(self):
         pass
 
-    @unittest.skip("Kandinsky5T2VPipeline does not support xformers")
-    def test_xformers_attention_forwardGenerator_pass(self):
+    @unittest.skip("Only SDPA or NABLA (flex)")
+    def test_xformers_memory_efficient_attention(self):
         pass
 
-    @unittest.skip("Kandinsky5T2VPipeline does not support VAE slicing")
-    def test_vae_slicing(self):
+    @unittest.skip("TODO:Test does not work")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("TODO: revisit")
+    def test_inference_batch_single_identical(self):
         pass
diff --git a/tests/pipelines/kandinsky5/test_kandinsky5_i2i.py b/tests/pipelines/kandinsky5/test_kandinsky5_i2i.py
new file mode 100644
index 000000000000..dc832990836f
--- /dev/null
+++ b/tests/pipelines/kandinsky5/test_kandinsky5_i2i.py
@@ -0,0 +1,213 @@
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from PIL import Image
+from transformers import (
+    AutoProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLForConditionalGeneration,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    Kandinsky5I2IPipeline,
+    Kandinsky5Transformer3DModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Kandinsky5I2IPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = Kandinsky5I2IPipeline
+
+    batch_params = ["prompt", "negative_prompt"]
+    params = frozenset(["image", "prompt", "height", "width", "num_inference_steps", "guidance_scale"])
+
+    required_optional_params = {
+        "num_inference_steps",
+        "generator",
+        "latents",
+        "return_dict",
+        "callback_on_step_end",
+        "callback_on_step_end_tensor_inputs",
+        "max_sequence_length",
+    }
+    test_xformers_attention = False
+    supports_optional_components = True
+    supports_dduf = False
+    test_attention_slicing = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            act_fn="silu",
+            block_out_channels=[32, 64, 64, 64],
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
+            force_upcast=True,
+            in_channels=3,
+            latent_channels=16,
+            layers_per_block=1,
+            mid_block_add_attention=False,
+            norm_num_groups=32,
+            out_channels=3,
+            sample_size=64,
+            scaling_factor=0.3611,
+            shift_factor=0.1159,
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            use_post_quant_conv=False,
+            use_quant_conv=False,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+
+        qwen_hidden_size = 32
+        torch.manual_seed(0)
+        qwen_config = Qwen2_5_VLConfig(
+            text_config={
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 2,
+                "num_key_value_heads": 2,
+                "rope_scaling": {
+                    "mrope_section": [2, 2, 4],
+                    "rope_type": "default",
+                    "type": "default",
+                },
+                "rope_theta": 1000000.0,
+            },
+            vision_config={
+                "depth": 2,
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
+                "num_heads": 2,
+                "out_hidden_size": qwen_hidden_size,
+            },
+            hidden_size=qwen_hidden_size,
+            vocab_size=152064,
+            vision_end_token_id=151653,
+            vision_start_token_id=151652,
+            vision_token_id=151654,
+        )
+        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
+        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        clip_hidden_size = 16
+        torch.manual_seed(0)
+        clip_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=clip_hidden_size,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=clip_hidden_size,
+        )
+        text_encoder_2 = CLIPTextModel(clip_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        transformer = Kandinsky5Transformer3DModel(
+            in_visual_dim=16,
+            in_text_dim=qwen_hidden_size,
+            in_text_dim2=clip_hidden_size,
+            time_dim=16,
+            out_visual_dim=16,
+            patch_size=(1, 2, 2),
+            model_dim=16,
+            ff_dim=32,
+            num_text_blocks=1,
+            num_visual_blocks=2,
+            axes_dims=(1, 1, 2),
+            visual_cond=True,
+            attention_type="regular",
+        )
+
+        return {
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "scheduler": scheduler,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = Image.new("RGB", (64, 64), color="red")
+
+        return {
+            "image": image,
+            "prompt": "a red square",
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "generator": generator,
+            "output_type": "pt",
+            "max_sequence_length": 8,
+        }
+
+    def test_inference(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.resolutions = [(64, 64)]
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = pipe(**inputs)
+        image = output.image
+
+        self.assertEqual(image.shape, (1, 3, 64, 64))
+
+    @unittest.skip("TODO: Test does not work")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("TODO: revisit, Batch isnot yet supported in this pipeline")
+    def test_num_images_per_prompt(self):
+        pass
+
+    @unittest.skip("TODO: revisit, Batch isnot yet supported in this pipeline")
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip("TODO: revisit, Batch isnot yet supported in this pipeline")
+    def test_inference_batch_consistent(self):
+        pass
+
+    @unittest.skip("TODO: revisit, not working")
+    def test_float16_inference(self):
+        pass
diff --git a/tests/pipelines/kandinsky5/test_kandinsky5_i2v.py b/tests/pipelines/kandinsky5/test_kandinsky5_i2v.py
new file mode 100644
index 000000000000..483c7b66e07b
--- /dev/null
+++ b/tests/pipelines/kandinsky5/test_kandinsky5_i2v.py
@@ -0,0 +1,211 @@
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from PIL import Image
+from transformers import (
+    AutoProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLForConditionalGeneration,
+)
+
+from diffusers import (
+    AutoencoderKLHunyuanVideo,
+    FlowMatchEulerDiscreteScheduler,
+    Kandinsky5I2VPipeline,
+    Kandinsky5Transformer3DModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Kandinsky5I2VPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = Kandinsky5I2VPipeline
+
+    batch_params = ["prompt", "negative_prompt"]
+    params = frozenset(["image", "prompt", "height", "width", "num_frames", "num_inference_steps", "guidance_scale"])
+
+    required_optional_params = {
+        "num_inference_steps",
+        "generator",
+        "latents",
+        "return_dict",
+        "callback_on_step_end",
+        "callback_on_step_end_tensor_inputs",
+        "max_sequence_length",
+    }
+    test_xformers_attention = False
+    supports_optional_components = True
+    supports_dduf = False
+    test_attention_slicing = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLHunyuanVideo(
+            act_fn="silu",
+            block_out_channels=[32, 64, 64],
+            down_block_types=[
+                "HunyuanVideoDownBlock3D",
+                "HunyuanVideoDownBlock3D",
+                "HunyuanVideoDownBlock3D",
+            ],
+            in_channels=3,
+            latent_channels=16,
+            layers_per_block=1,
+            mid_block_add_attention=False,
+            norm_num_groups=32,
+            out_channels=3,
+            scaling_factor=0.476986,
+            spatial_compression_ratio=8,
+            temporal_compression_ratio=4,
+            up_block_types=[
+                "HunyuanVideoUpBlock3D",
+                "HunyuanVideoUpBlock3D",
+                "HunyuanVideoUpBlock3D",
+            ],
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+
+        qwen_hidden_size = 32
+        torch.manual_seed(0)
+        qwen_config = Qwen2_5_VLConfig(
+            text_config={
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 2,
+                "num_key_value_heads": 2,
+                "rope_scaling": {
+                    "mrope_section": [2, 2, 4],
+                    "rope_type": "default",
+                    "type": "default",
+                },
+                "rope_theta": 1000000.0,
+            },
+            vision_config={
+                "depth": 2,
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
+                "num_heads": 2,
+                "out_hidden_size": qwen_hidden_size,
+            },
+            hidden_size=qwen_hidden_size,
+            vocab_size=152064,
+            vision_end_token_id=151653,
+            vision_start_token_id=151652,
+            vision_token_id=151654,
+        )
+        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
+        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        clip_hidden_size = 16
+        torch.manual_seed(0)
+        clip_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=clip_hidden_size,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=clip_hidden_size,
+        )
+        text_encoder_2 = CLIPTextModel(clip_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        transformer = Kandinsky5Transformer3DModel(
+            in_visual_dim=16,
+            in_text_dim=qwen_hidden_size,
+            in_text_dim2=clip_hidden_size,
+            time_dim=16,
+            out_visual_dim=16,
+            patch_size=(1, 2, 2),
+            model_dim=16,
+            ff_dim=32,
+            num_text_blocks=1,
+            num_visual_blocks=2,
+            axes_dims=(1, 1, 2),
+            visual_cond=True,
+            attention_type="regular",
+        )
+
+        return {
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "scheduler": scheduler,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = Image.new("RGB", (32, 32), color="red")
+
+        return {
+            "image": image,
+            "prompt": "a red square",
+            "height": 32,
+            "width": 32,
+            "num_frames": 17,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "generator": generator,
+            "output_type": "pt",
+            "max_sequence_length": 8,
+        }
+
+    def test_inference(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = pipe(**inputs)
+        video = output.frames[0]
+
+        # 17 frames, RGB, 32×32
+        self.assertEqual(video.shape, (17, 3, 32, 32))
+
+    @unittest.skip("TODO:Test does not work")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("TODO: revisit")
+    def test_callback_inputs(self):
+        pass
+
+    @unittest.skip("TODO: revisit")
+    def test_inference_batch_single_identical(self):
+        pass
diff --git a/tests/pipelines/kandinsky5/test_kandinsky5_t2i.py b/tests/pipelines/kandinsky5/test_kandinsky5_t2i.py
new file mode 100644
index 000000000000..e961103906a2
--- /dev/null
+++ b/tests/pipelines/kandinsky5/test_kandinsky5_t2i.py
@@ -0,0 +1,207 @@
+# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from transformers import (
+    AutoProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTokenizer,
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLForConditionalGeneration,
+)
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    Kandinsky5T2IPipeline,
+    Kandinsky5Transformer3DModel,
+)
+from diffusers.utils.testing_utils import enable_full_determinism
+
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class Kandinsky5T2IPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = Kandinsky5T2IPipeline
+
+    batch_params = ["prompt", "negative_prompt"]
+    params = frozenset(["prompt", "height", "width", "num_inference_steps", "guidance_scale"])
+
+    required_optional_params = {
+        "num_inference_steps",
+        "generator",
+        "latents",
+        "return_dict",
+        "callback_on_step_end",
+        "callback_on_step_end_tensor_inputs",
+        "max_sequence_length",
+    }
+    test_xformers_attention = False
+    supports_optional_components = True
+    supports_dduf = False
+    test_attention_slicing = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            act_fn="silu",
+            block_out_channels=[32, 64],
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            force_upcast=True,
+            in_channels=3,
+            latent_channels=16,
+            layers_per_block=1,
+            mid_block_add_attention=False,
+            norm_num_groups=32,
+            out_channels=3,
+            sample_size=128,
+            scaling_factor=0.3611,
+            shift_factor=0.1159,
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            use_post_quant_conv=False,
+            use_quant_conv=False,
+        )
+
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+
+        qwen_hidden_size = 32
+        torch.manual_seed(0)
+        qwen_config = Qwen2_5_VLConfig(
+            text_config={
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
+                "num_hidden_layers": 2,
+                "num_attention_heads": 2,
+                "num_key_value_heads": 2,
+                "rope_scaling": {
+                    "mrope_section": [2, 2, 4],
+                    "rope_type": "default",
+                    "type": "default",
+                },
+                "rope_theta": 1000000.0,
+            },
+            vision_config={
+                "depth": 2,
+                "hidden_size": qwen_hidden_size,
+                "intermediate_size": qwen_hidden_size,
+                "num_heads": 2,
+                "out_hidden_size": qwen_hidden_size,
+            },
+            hidden_size=qwen_hidden_size,
+            vocab_size=152064,
+            vision_end_token_id=151653,
+            vision_start_token_id=151652,
+            vision_token_id=151654,
+        )
+        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
+        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        clip_hidden_size = 16
+        torch.manual_seed(0)
+        clip_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=clip_hidden_size,
+            intermediate_size=16,
+            layer_norm_eps=1e-05,
+            num_attention_heads=2,
+            num_hidden_layers=2,
+            pad_token_id=1,
+            vocab_size=1000,
+            projection_dim=clip_hidden_size,
+        )
+        text_encoder_2 = CLIPTextModel(clip_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        torch.manual_seed(0)
+        transformer = Kandinsky5Transformer3DModel(
+            in_visual_dim=16,
+            in_text_dim=qwen_hidden_size,
+            in_text_dim2=clip_hidden_size,
+            time_dim=16,
+            out_visual_dim=16,
+            patch_size=(1, 2, 2),
+            model_dim=16,
+            ff_dim=32,
+            num_text_blocks=1,
+            num_visual_blocks=2,
+            axes_dims=(1, 1, 2),
+            visual_cond=False,
+            attention_type="regular",
+        )
+
+        return {
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer_2": tokenizer_2,
+            "transformer": transformer,
+            "scheduler": scheduler,
+        }
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        return {
+            "prompt": "a red square",
+            "height": 64,
+            "width": 64,
+            "num_inference_steps": 2,
+            "guidance_scale": 4.0,
+            "generator": generator,
+            "output_type": "pt",
+            "max_sequence_length": 8,
+        }
+
+    def test_inference(self):
+        device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.resolutions = [(64, 64)]
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = pipe(**inputs)
+        image = output.image
+
+        self.assertEqual(image.shape, (1, 3, 16, 16))
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-3)
+
+    @unittest.skip("Test not supported")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skip("Only SDPA or NABLA (flex)")
+    def test_xformers_memory_efficient_attention(self):
+        pass
+
+    @unittest.skip("All encoders are needed")
+    def test_encode_prompt_works_in_isolation(self):
+        pass
+
+    @unittest.skip("Meant for eiter FP32 or BF16 inference")
+    def test_float16_inference(self):
+        pass
diff --git a/tests/pipelines/ovis_image/__init__.py b/tests/pipelines/ovis_image/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/sana_video/__init__.py b/tests/pipelines/sana_video/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/sana_video/test_sana_video.py b/tests/pipelines/sana_video/test_sana_video.py
new file mode 100644
index 000000000000..9f360a942a64
--- /dev/null
+++ b/tests/pipelines/sana_video/test_sana_video.py
@@ -0,0 +1,225 @@
+# Copyright 2025 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from transformers import Gemma2Config, Gemma2Model, GemmaTokenizer
+
+from diffusers import AutoencoderKLWan, DPMSolverMultistepScheduler, SanaVideoPipeline, SanaVideoTransformer3DModel
+
+from ...testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class SanaVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = SanaVideoPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        scheduler = DPMSolverMultistepScheduler()
+
+        torch.manual_seed(0)
+        text_encoder_config = Gemma2Config(
+            head_dim=16,
+            hidden_size=8,
+            initializer_range=0.02,
+            intermediate_size=64,
+            max_position_embeddings=8192,
+            model_type="gemma2",
+            num_attention_heads=2,
+            num_hidden_layers=1,
+            num_key_value_heads=2,
+            vocab_size=8,
+            attn_implementation="eager",
+        )
+        text_encoder = Gemma2Model(text_encoder_config)
+        tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
+
+        torch.manual_seed(0)
+        transformer = SanaVideoTransformer3DModel(
+            in_channels=16,
+            out_channels=16,
+            num_attention_heads=2,
+            attention_head_dim=12,
+            num_layers=2,
+            num_cross_attention_heads=2,
+            cross_attention_head_dim=12,
+            cross_attention_dim=24,
+            caption_channels=8,
+            mlp_ratio=2.5,
+            dropout=0.0,
+            attention_bias=False,
+            sample_size=8,
+            patch_size=(1, 2, 2),
+            norm_elementwise_affine=False,
+            norm_eps=1e-6,
+            qk_norm="rms_norm_across_heads",
+            rope_max_seq_len=32,
+        )
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "",
+            "negative_prompt": "",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": 32,
+            "width": 32,
+            "frames": 9,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+            "complex_human_instruction": [],
+            "use_resolution_binning": False,
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+        self.assertEqual(generated_video.shape, (9, 3, 32, 32))
+
+    @unittest.skip("Test not supported")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_save_load_local(self, expected_max_difference=5e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        torch.manual_seed(0)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        torch.manual_seed(0)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(output.detach().cpu().numpy() - output_loaded.detach().cpu().numpy()).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    # TODO(aryan): Create a dummy gemma model with smol vocab size
+    @unittest.skip(
+        "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
+    )
+    def test_inference_batch_consistent(self):
+        pass
+
+    @unittest.skip(
+        "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
+    )
+    def test_inference_batch_single_identical(self):
+        pass
+
+    def test_float16_inference(self):
+        # Requires higher tolerance as model seems very sensitive to dtype
+        super().test_float16_inference(expected_max_diff=0.08)
+
+    def test_save_load_float16(self):
+        # Requires higher tolerance as model seems very sensitive to dtype
+        super().test_save_load_float16(expected_max_diff=0.2)
+
+
+@slow
+@require_torch_accelerator
+class SanaVideoPipelineIntegrationTests(unittest.TestCase):
+    prompt = "Evening, backlight, side lighting, soft light, high contrast, mid-shot, centered composition, clean solo shot, warm color. A young Caucasian man stands in a forest."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @unittest.skip("TODO: test needs to be implemented")
+    def test_sana_video_480p(self):
+        pass
diff --git a/tests/pipelines/sana_video/test_sana_video_i2v.py b/tests/pipelines/sana_video/test_sana_video_i2v.py
new file mode 100644
index 000000000000..36a646ca528f
--- /dev/null
+++ b/tests/pipelines/sana_video/test_sana_video_i2v.py
@@ -0,0 +1,238 @@
+# Copyright 2025 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import Gemma2Config, Gemma2Model, GemmaTokenizer
+
+from diffusers import (
+    AutoencoderKLWan,
+    FlowMatchEulerDiscreteScheduler,
+    SanaImageToVideoPipeline,
+    SanaVideoTransformer3DModel,
+)
+
+from ...testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class SanaImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = SanaImageToVideoPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        torch.manual_seed(0)
+        text_encoder_config = Gemma2Config(
+            head_dim=16,
+            hidden_size=8,
+            initializer_range=0.02,
+            intermediate_size=64,
+            max_position_embeddings=8192,
+            model_type="gemma2",
+            num_attention_heads=2,
+            num_hidden_layers=1,
+            num_key_value_heads=2,
+            vocab_size=8,
+            attn_implementation="eager",
+        )
+        text_encoder = Gemma2Model(text_encoder_config)
+        tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
+
+        torch.manual_seed(0)
+        transformer = SanaVideoTransformer3DModel(
+            in_channels=16,
+            out_channels=16,
+            num_attention_heads=2,
+            attention_head_dim=12,
+            num_layers=2,
+            num_cross_attention_heads=2,
+            cross_attention_head_dim=12,
+            cross_attention_dim=24,
+            caption_channels=8,
+            mlp_ratio=2.5,
+            dropout=0.0,
+            attention_bias=False,
+            sample_size=8,
+            patch_size=(1, 2, 2),
+            norm_elementwise_affine=False,
+            norm_eps=1e-6,
+            qk_norm="rms_norm_across_heads",
+            rope_max_seq_len=32,
+        )
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        # Create a dummy image input (PIL Image)
+        image = Image.new("RGB", (32, 32))
+
+        inputs = {
+            "image": image,
+            "prompt": "",
+            "negative_prompt": "",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": 32,
+            "width": 32,
+            "frames": 9,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+            "complex_human_instruction": [],
+            "use_resolution_binning": False,
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+        self.assertEqual(generated_video.shape, (9, 3, 32, 32))
+
+    @unittest.skip("Test not supported")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    def test_save_load_local(self, expected_max_difference=5e-4):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        torch.manual_seed(0)
+        output = pipe(**inputs)[0]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=False)
+            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
+            pipe_loaded.to(torch_device)
+            pipe_loaded.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        torch.manual_seed(0)
+        output_loaded = pipe_loaded(**inputs)[0]
+
+        max_diff = np.abs(output.detach().cpu().numpy() - output_loaded.detach().cpu().numpy()).max()
+        self.assertLess(max_diff, expected_max_difference)
+
+    # TODO(aryan): Create a dummy gemma model with smol vocab size
+    @unittest.skip(
+        "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
+    )
+    def test_inference_batch_consistent(self):
+        pass
+
+    @unittest.skip(
+        "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
+    )
+    def test_inference_batch_single_identical(self):
+        pass
+
+    @unittest.skip("Skipping fp16 test as model is trained with bf16")
+    def test_float16_inference(self):
+        # Requires higher tolerance as model seems very sensitive to dtype
+        super().test_float16_inference(expected_max_diff=0.08)
+
+    @unittest.skip("Skipping fp16 test as model is trained with bf16")
+    def test_save_load_float16(self):
+        # Requires higher tolerance as model seems very sensitive to dtype
+        super().test_save_load_float16(expected_max_diff=0.2)
+
+
+@slow
+@require_torch_accelerator
+class SanaVideoPipelineIntegrationTests(unittest.TestCase):
+    prompt = "Evening, backlight, side lighting, soft light, high contrast, mid-shot, centered composition, clean solo shot, warm color. A young Caucasian man stands in a forest."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @unittest.skip("TODO: test needs to be implemented")
+    def test_sana_video_480p(self):
+        pass
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
index 2e4b428dfeb5..285c2fea7ebc 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py
@@ -37,6 +37,7 @@
     floats_tensor,
     load_image,
     load_numpy,
+    require_accelerator,
     require_torch_accelerator,
     slow,
     torch_device,
@@ -222,6 +223,7 @@ def test_stable_diffusion_latent_upscaler_multiple_init_images(self):
     def test_attention_slicing_forward_pass(self):
         super().test_attention_slicing_forward_pass(expected_max_diff=7e-3)
 
+    @require_accelerator
     def test_sequential_cpu_offload_forward_pass(self):
         super().test_sequential_cpu_offload_forward_pass(expected_max_diff=3e-3)
 
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 2af4ad0314c3..7db5f4da89ca 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -36,6 +36,7 @@
 from diffusers.hooks.faster_cache import FasterCacheBlockHook, FasterCacheDenoiserHook
 from diffusers.hooks.first_block_cache import FirstBlockCacheConfig
 from diffusers.hooks.pyramid_attention_broadcast import PyramidAttentionBroadcastHook
+from diffusers.hooks.taylorseer_cache import TaylorSeerCacheConfig
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import FluxIPAdapterMixin, IPAdapterMixin
 from diffusers.models.attention import AttentionModuleMixin
@@ -103,7 +104,7 @@ def check_qkv_fusion_processors_exist(model):
 def check_qkv_fused_layers_exist(model, layer_names):
     is_fused_submodules = []
     for submodule in model.modules():
-        if not isinstance(submodule, AttentionModuleMixin):
+        if not isinstance(submodule, AttentionModuleMixin) or not submodule._supports_qkv_fusion:
             continue
         is_fused_attribute_set = submodule.fused_projections
         is_fused_layer = True
@@ -1422,7 +1423,18 @@ def test_float16_inference(self, expected_max_diff=5e-2):
     def test_save_load_float16(self, expected_max_diff=1e-2):
         components = self.get_dummy_components()
         for name, module in components.items():
-            if hasattr(module, "half"):
+            # Account for components with _keep_in_fp32_modules
+            if hasattr(module, "_keep_in_fp32_modules") and module._keep_in_fp32_modules is not None:
+                for name, param in module.named_parameters():
+                    if any(
+                        module_to_keep_in_fp32 in name.split(".")
+                        for module_to_keep_in_fp32 in module._keep_in_fp32_modules
+                    ):
+                        param.data = param.data.to(torch_device).to(torch.float32)
+                    else:
+                        param.data = param.data.to(torch_device).to(torch.float16)
+
+            elif hasattr(module, "half"):
                 components[name] = module.to(torch_device).half()
 
         pipe = self.pipeline_class(**components)
@@ -2913,6 +2925,57 @@ def run_forward(pipe):
         )
 
 
+class TaylorSeerCacheTesterMixin:
+    taylorseer_cache_config = TaylorSeerCacheConfig(
+        cache_interval=5,
+        disable_cache_before_step=10,
+        max_order=1,
+        taylor_factors_dtype=torch.bfloat16,
+        use_lite_mode=True,
+    )
+
+    def test_taylorseer_cache_inference(self, expected_atol: float = 0.1):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+
+        def create_pipe():
+            torch.manual_seed(0)
+            num_layers = 2
+            components = self.get_dummy_components(num_layers=num_layers)
+            pipe = self.pipeline_class(**components)
+            pipe = pipe.to(device)
+            pipe.set_progress_bar_config(disable=None)
+            return pipe
+
+        def run_forward(pipe):
+            torch.manual_seed(0)
+            inputs = self.get_dummy_inputs(device)
+            inputs["num_inference_steps"] = 50
+            return pipe(**inputs)[0]
+
+        # Run inference without TaylorSeerCache
+        pipe = create_pipe()
+        output = run_forward(pipe).flatten()
+        original_image_slice = np.concatenate((output[:8], output[-8:]))
+
+        # Run inference with TaylorSeerCache enabled
+        pipe = create_pipe()
+        pipe.transformer.enable_cache(self.taylorseer_cache_config)
+        output = run_forward(pipe).flatten()
+        image_slice_fbc_enabled = np.concatenate((output[:8], output[-8:]))
+
+        # Run inference with TaylorSeerCache disabled
+        pipe.transformer.disable_cache()
+        output = run_forward(pipe).flatten()
+        image_slice_fbc_disabled = np.concatenate((output[:8], output[-8:]))
+
+        assert np.allclose(original_image_slice, image_slice_fbc_enabled, atol=expected_atol), (
+            "TaylorSeerCache outputs should not differ much."
+        )
+        assert np.allclose(original_image_slice, image_slice_fbc_disabled, atol=1e-4), (
+            "Outputs from normal inference and after disabling cache should not differ."
+        )
+
+
 # Some models (e.g. unCLIP) are extremely likely to significantly deviate depending on which hardware is used.
 # This helper function is used to check that the image doesn't deviate on average more than 10 pixels from a
 # reference image.
diff --git a/tests/pipelines/wan/test_wan_animate.py b/tests/pipelines/wan/test_wan_animate.py
new file mode 100644
index 000000000000..d6d1b09f3620
--- /dev/null
+++ b/tests/pipelines/wan/test_wan_animate.py
@@ -0,0 +1,239 @@
+# Copyright 2025 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import (
+    AutoTokenizer,
+    CLIPImageProcessor,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+    T5EncoderModel,
+)
+
+from diffusers import (
+    AutoencoderKLWan,
+    FlowMatchEulerDiscreteScheduler,
+    WanAnimatePipeline,
+    WanAnimateTransformer3DModel,
+)
+
+from ...testing_utils import (
+    backend_empty_cache,
+    enable_full_determinism,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class WanAnimatePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = WanAnimatePipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
+        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        channel_sizes = {"4": 16, "8": 16, "16": 16}
+        transformer = WanAnimateTransformer3DModel(
+            patch_size=(1, 2, 2),
+            num_attention_heads=2,
+            attention_head_dim=12,
+            in_channels=36,
+            latent_channels=16,
+            out_channels=16,
+            text_dim=32,
+            freq_dim=256,
+            ffn_dim=32,
+            num_layers=2,
+            cross_attn_norm=True,
+            qk_norm="rms_norm_across_heads",
+            image_dim=4,
+            rope_max_seq_len=32,
+            motion_encoder_channel_sizes=channel_sizes,
+            motion_encoder_size=16,
+            motion_style_dim=8,
+            motion_dim=4,
+            motion_encoder_dim=16,
+            face_encoder_hidden_dim=16,
+            face_encoder_num_heads=2,
+            inject_face_latents_blocks=2,
+        )
+
+        torch.manual_seed(0)
+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=4,
+            projection_dim=4,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            image_size=4,
+            intermediate_size=16,
+            patch_size=1,
+        )
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        torch.manual_seed(0)
+        image_processor = CLIPImageProcessor(crop_size=4, size=4)
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "image_encoder": image_encoder,
+            "image_processor": image_processor,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        num_frames = 17
+        height = 16
+        width = 16
+        face_height = 16
+        face_width = 16
+
+        image = Image.new("RGB", (height, width))
+        pose_video = [Image.new("RGB", (height, width))] * num_frames
+        face_video = [Image.new("RGB", (face_height, face_width))] * num_frames
+
+        inputs = {
+            "image": image,
+            "pose_video": pose_video,
+            "face_video": face_video,
+            "prompt": "dance monkey",
+            "negative_prompt": "negative",
+            "height": height,
+            "width": width,
+            "segment_frame_length": 77,  # TODO: can we set this to num_frames?
+            "num_inference_steps": 2,
+            "mode": "animate",
+            "prev_segment_conditioning_frames": 1,
+            "generator": generator,
+            "guidance_scale": 1.0,
+            "output_type": "pt",
+            "max_sequence_length": 16,
+        }
+        return inputs
+
+    def test_inference(self):
+        """Test basic inference in animation mode."""
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames[0]
+        self.assertEqual(video.shape, (17, 3, 16, 16))
+
+        expected_video = torch.randn(17, 3, 16, 16)
+        max_diff = np.abs(video - expected_video).max()
+        self.assertLessEqual(max_diff, 1e10)
+
+    def test_inference_replacement(self):
+        """Test the pipeline in replacement mode with background and mask videos."""
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["mode"] = "replace"
+        num_frames = 17
+        height = 16
+        width = 16
+        inputs["background_video"] = [Image.new("RGB", (height, width))] * num_frames
+        inputs["mask_video"] = [Image.new("L", (height, width))] * num_frames
+
+        video = pipe(**inputs).frames[0]
+        self.assertEqual(video.shape, (17, 3, 16, 16))
+
+    @unittest.skip("Test not supported")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skip(
+        "Setting the Wan Animate latents to zero at the last denoising step does not guarantee that the output will be"
+        " zero. I believe this is because the latents are further processed in the outer loop where we loop over"
+        " inference segments."
+    )
+    def test_callback_inputs(self):
+        pass
+
+
+@slow
+@require_torch_accelerator
+class WanAnimatePipelineIntegrationTests(unittest.TestCase):
+    prompt = "A painting of a squirrel eating a burger."
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @unittest.skip("TODO: test needs to be implemented")
+    def test_wan_animate(self):
+        pass
diff --git a/tests/pipelines/z_image/__init__.py b/tests/pipelines/z_image/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/z_image/test_z_image.py b/tests/pipelines/z_image/test_z_image.py
new file mode 100644
index 000000000000..79a5fa0de5f0
--- /dev/null
+++ b/tests/pipelines/z_image/test_z_image.py
@@ -0,0 +1,307 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import unittest
+
+import numpy as np
+import torch
+from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model
+
+from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, ZImagePipeline, ZImageTransformer2DModel
+
+from ...testing_utils import torch_device
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin, to_np
+
+
+# Z-Image requires torch.use_deterministic_algorithms(False) due to complex64 RoPE operations
+# Cannot use enable_full_determinism() which sets it to True
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+torch.use_deterministic_algorithms(False)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+if hasattr(torch.backends, "cuda"):
+    torch.backends.cuda.matmul.allow_tf32 = False
+
+# Note: Some tests (test_float16_inference, test_save_load_float16) may fail in full suite
+# due to RopeEmbedder cache state pollution between tests. They pass when run individually.
+# This is a known test isolation issue, not a functional bug.
+
+
+class ZImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = ZImagePipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    supports_dduf = False
+    test_xformers_attention = False
+    test_layerwise_casting = True
+    test_group_offloading = True
+
+    def setUp(self):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = ZImageTransformer2DModel(
+            all_patch_size=(2,),
+            all_f_patch_size=(1,),
+            in_channels=16,
+            dim=32,
+            n_layers=2,
+            n_refiner_layers=1,
+            n_heads=2,
+            n_kv_heads=2,
+            norm_eps=1e-5,
+            qk_norm=True,
+            cap_feat_dim=16,
+            rope_theta=256.0,
+            t_scale=1000.0,
+            axes_dims=[8, 4, 4],
+            axes_lens=[256, 32, 32],
+        )
+        # `x_pad_token` and `cap_pad_token` are initialized with `torch.empty`.
+        # This can cause NaN data values in our testing environment. Fixating them
+        # helps prevent that issue.
+        with torch.no_grad():
+            transformer.x_pad_token.copy_(torch.ones_like(transformer.x_pad_token.data))
+            transformer.cap_pad_token.copy_(torch.ones_like(transformer.cap_pad_token.data))
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            block_out_channels=[32, 64],
+            layers_per_block=1,
+            latent_channels=16,
+            norm_num_groups=32,
+            sample_size=32,
+            scaling_factor=0.3611,
+            shift_factor=0.1159,
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        torch.manual_seed(0)
+        config = Qwen3Config(
+            hidden_size=16,
+            intermediate_size=16,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            num_key_value_heads=2,
+            vocab_size=151936,
+            max_position_embeddings=512,
+        )
+        text_encoder = Qwen3Model(config)
+        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "bad quality",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 3.0,
+            "cfg_normalization": False,
+            "cfg_truncation": 1.0,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        generated_image = image[0]
+        self.assertEqual(generated_image.shape, (3, 32, 32))
+
+        # fmt: off
+        expected_slice = torch.tensor([0.4622, 0.4532, 0.4714, 0.5087, 0.5371, 0.5405, 0.4492, 0.4479, 0.2984, 0.2783, 0.5409, 0.6577, 0.3952, 0.5524, 0.5262, 0.453])
+        # fmt: on
+
+        generated_slice = generated_image.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=5e-2))
+
+    def test_inference_batch_single_identical(self):
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-1)
+
+    def test_num_images_per_prompt(self):
+        import inspect
+
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if "num_images_per_prompt" not in sig.parameters:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_sizes = [1, 2]
+        num_images_per_prompts = [1, 2]
+
+        for batch_size in batch_sizes:
+            for num_images_per_prompt in num_images_per_prompts:
+                inputs = self.get_dummy_inputs(torch_device)
+
+                for key in inputs.keys():
+                    if key in self.batch_params:
+                        inputs[key] = batch_size * [inputs[key]]
+
+                images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+                assert images.shape[0] == batch_size * num_images_per_prompt
+
+        del pipe
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+
+    def test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        if not self.test_attention_slicing:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+    def test_vae_tiling(self, expected_diff_max: float = 0.2):
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        # Without tiling
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_without_tiling = pipe(**inputs)[0]
+
+        # With tiling (standard AutoencoderKL doesn't accept parameters)
+        pipe.vae.enable_tiling()
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        output_with_tiling = pipe(**inputs)[0]
+
+        self.assertLess(
+            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
+            expected_diff_max,
+            "VAE tiling should not affect the inference results",
+        )
+
+    def test_pipeline_with_accelerator_device_map(self, expected_max_difference=5e-4):
+        # Z-Image RoPE embeddings (complex64) have slightly higher numerical tolerance
+        super().test_pipeline_with_accelerator_device_map(expected_max_difference=expected_max_difference)
+
+    def test_group_offloading_inference(self):
+        # Block-level offloading conflicts with RoPE cache. Pipeline-level offloading (tested separately) works fine.
+        self.skipTest("Using test_pipeline_level_group_offloading_inference instead")
+
+    def test_save_load_float16(self, expected_max_diff=1e-2):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+        super().test_save_load_float16(expected_max_diff=expected_max_diff)
diff --git a/tests/pipelines/z_image/test_z_image_img2img.py b/tests/pipelines/z_image/test_z_image_img2img.py
new file mode 100644
index 000000000000..91b3025b17e8
--- /dev/null
+++ b/tests/pipelines/z_image/test_z_image_img2img.py
@@ -0,0 +1,358 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import unittest
+
+import numpy as np
+import torch
+from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model
+
+from diffusers import (
+    AutoencoderKL,
+    FlowMatchEulerDiscreteScheduler,
+    ZImageImg2ImgPipeline,
+    ZImageTransformer2DModel,
+)
+from diffusers.utils.testing_utils import floats_tensor
+
+from ...testing_utils import torch_device
+from ..pipeline_params import (
+    IMAGE_TO_IMAGE_IMAGE_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
+    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+)
+from ..test_pipelines_common import PipelineTesterMixin, to_np
+
+
+# Z-Image requires torch.use_deterministic_algorithms(False) due to complex64 RoPE operations
+# Cannot use enable_full_determinism() which sets it to True
+# Note: Z-Image does not support FP16 inference due to complex64 RoPE embeddings
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+torch.use_deterministic_algorithms(False)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+if hasattr(torch.backends, "cuda"):
+    torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class ZImageImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = ZImageImg2ImgPipeline
+    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
+    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "strength",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    supports_dduf = False
+    test_xformers_attention = False
+    test_layerwise_casting = True
+    test_group_offloading = True
+
+    def setUp(self):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        transformer = ZImageTransformer2DModel(
+            all_patch_size=(2,),
+            all_f_patch_size=(1,),
+            in_channels=16,
+            dim=32,
+            n_layers=2,
+            n_refiner_layers=1,
+            n_heads=2,
+            n_kv_heads=2,
+            norm_eps=1e-5,
+            qk_norm=True,
+            cap_feat_dim=16,
+            rope_theta=256.0,
+            t_scale=1000.0,
+            axes_dims=[8, 4, 4],
+            axes_lens=[256, 32, 32],
+        )
+
+        torch.manual_seed(0)
+        vae = AutoencoderKL(
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            block_out_channels=[32, 64],
+            layers_per_block=1,
+            latent_channels=16,
+            norm_num_groups=32,
+            sample_size=32,
+            scaling_factor=0.3611,
+            shift_factor=0.1159,
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMatchEulerDiscreteScheduler()
+
+        torch.manual_seed(0)
+        config = Qwen3Config(
+            hidden_size=16,
+            intermediate_size=16,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            num_key_value_heads=2,
+            vocab_size=151936,
+            max_position_embeddings=512,
+        )
+        text_encoder = Qwen3Model(config)
+        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        import random
+
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+
+        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "bad quality",
+            "image": image,
+            "strength": 0.6,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 3.0,
+            "cfg_normalization": False,
+            "cfg_truncation": 1.0,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 16,
+            "output_type": "np",
+        }
+
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = pipe(**inputs).images
+        generated_image = image[0]
+        self.assertEqual(generated_image.shape, (32, 32, 3))
+
+    def test_inference_batch_single_identical(self):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-1)
+
+    def test_num_images_per_prompt(self):
+        import inspect
+
+        sig = inspect.signature(self.pipeline_class.__call__)
+
+        if "num_images_per_prompt" not in sig.parameters:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        batch_sizes = [1, 2]
+        num_images_per_prompts = [1, 2]
+
+        for batch_size in batch_sizes:
+            for num_images_per_prompt in num_images_per_prompts:
+                inputs = self.get_dummy_inputs(torch_device)
+
+                for key in inputs.keys():
+                    if key in self.batch_params:
+                        inputs[key] = batch_size * [inputs[key]]
+
+                images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]
+
+                assert images.shape[0] == batch_size * num_images_per_prompt
+
+        del pipe
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+
+    def test_attention_slicing_forward_pass(
+        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
+    ):
+        if not self.test_attention_slicing:
+            return
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_slicing = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=1)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing1 = pipe(**inputs)[0]
+
+        pipe.enable_attention_slicing(slice_size=2)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_slicing2 = pipe(**inputs)[0]
+
+        if test_max_difference:
+            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
+            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
+            self.assertLess(
+                max(max_diff1, max_diff2),
+                expected_max_diff,
+                "Attention slicing should not affect the inference results",
+            )
+
+    def test_vae_tiling(self, expected_diff_max: float = 0.3):
+        import random
+
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+
+        pipe = self.pipeline_class(**components)
+        pipe.to("cpu")
+        pipe.set_progress_bar_config(disable=None)
+
+        # Without tiling
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        # Generate a larger image for the input
+        inputs["image"] = floats_tensor((1, 3, 128, 128), rng=random.Random(0)).to("cpu")
+        output_without_tiling = pipe(**inputs)[0]
+
+        # With tiling (standard AutoencoderKL doesn't accept parameters)
+        pipe.vae.enable_tiling()
+        inputs = self.get_dummy_inputs(generator_device)
+        inputs["height"] = inputs["width"] = 128
+        inputs["image"] = floats_tensor((1, 3, 128, 128), rng=random.Random(0)).to("cpu")
+        output_with_tiling = pipe(**inputs)[0]
+
+        self.assertLess(
+            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
+            expected_diff_max,
+            "VAE tiling should not affect the inference results",
+        )
+
+    def test_pipeline_with_accelerator_device_map(self, expected_max_difference=5e-4):
+        # Z-Image RoPE embeddings (complex64) have slightly higher numerical tolerance
+        super().test_pipeline_with_accelerator_device_map(expected_max_difference=expected_max_difference)
+
+    def test_group_offloading_inference(self):
+        # Block-level offloading conflicts with RoPE cache. Pipeline-level offloading (tested separately) works fine.
+        self.skipTest("Using test_pipeline_level_group_offloading_inference instead")
+
+    def test_save_load_float16(self, expected_max_diff=1e-2):
+        # Z-Image does not support FP16 due to complex64 RoPE embeddings
+        self.skipTest("Z-Image does not support FP16 inference")
+
+    def test_float16_inference(self, expected_max_diff=5e-2):
+        # Z-Image does not support FP16 due to complex64 RoPE embeddings
+        self.skipTest("Z-Image does not support FP16 inference")
+
+    def test_strength_parameter(self):
+        """Test that strength parameter affects the output correctly."""
+        device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        # Test with different strength values
+        inputs_low_strength = self.get_dummy_inputs(device)
+        inputs_low_strength["strength"] = 0.2
+
+        inputs_high_strength = self.get_dummy_inputs(device)
+        inputs_high_strength["strength"] = 0.8
+
+        # Both should complete without errors
+        output_low = pipe(**inputs_low_strength).images[0]
+        output_high = pipe(**inputs_high_strength).images[0]
+
+        # Outputs should be different (different amount of transformation)
+        self.assertFalse(np.allclose(output_low, output_high, atol=1e-3))
+
+    def test_invalid_strength(self):
+        """Test that invalid strength values raise appropriate errors."""
+        device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+
+        inputs = self.get_dummy_inputs(device)
+
+        # Test strength < 0
+        inputs["strength"] = -0.1
+        with self.assertRaises(ValueError):
+            pipe(**inputs)
+
+        # Test strength > 1
+        inputs["strength"] = 1.5
+        with self.assertRaises(ValueError):
+            pipe(**inputs)
diff --git a/tests/quantization/gguf/test_gguf.py b/tests/quantization/gguf/test_gguf.py
index 0f4fd408a7c1..b42764be10d6 100644
--- a/tests/quantization/gguf/test_gguf.py
+++ b/tests/quantization/gguf/test_gguf.py
@@ -16,6 +16,7 @@
     HiDreamImageTransformer2DModel,
     SD3Transformer2DModel,
     StableDiffusion3Pipeline,
+    WanAnimateTransformer3DModel,
     WanTransformer3DModel,
     WanVACETransformer3DModel,
 )
@@ -721,6 +722,33 @@ def get_dummy_inputs(self):
         }
 
 
+class WanAnimateGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
+    ckpt_path = "https://huggingface.co/QuantStack/Wan2.2-Animate-14B-GGUF/blob/main/Wan2.2-Animate-14B-Q3_K_S.gguf"
+    torch_dtype = torch.bfloat16
+    model_cls = WanAnimateTransformer3DModel
+    expected_memory_use_in_gb = 9
+
+    def get_dummy_inputs(self):
+        return {
+            "hidden_states": torch.randn((1, 16, 2, 64, 64), generator=torch.Generator("cpu").manual_seed(0)).to(
+                torch_device, self.torch_dtype
+            ),
+            "encoder_hidden_states": torch.randn(
+                (1, 512, 4096),
+                generator=torch.Generator("cpu").manual_seed(0),
+            ).to(torch_device, self.torch_dtype),
+            "control_hidden_states": torch.randn(
+                (1, 96, 2, 64, 64),
+                generator=torch.Generator("cpu").manual_seed(0),
+            ).to(torch_device, self.torch_dtype),
+            "control_hidden_states_scale": torch.randn(
+                (8,),
+                generator=torch.Generator("cpu").manual_seed(0),
+            ).to(torch_device, self.torch_dtype),
+            "timestep": torch.tensor([1]).to(torch_device, self.torch_dtype),
+        }
+
+
 @require_torch_version_greater("2.7.1")
 class GGUFCompileTests(QuantCompileTests, unittest.TestCase):
     torch_dtype = torch.bfloat16
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 951ba4128033..4550813259af 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -13,7 +13,6 @@
 import sys
 import tempfile
 import time
-import unittest
 import urllib.parse
 from collections import UserDict
 from contextlib import contextmanager
@@ -24,6 +23,7 @@
 import numpy as np
 import PIL.Image
 import PIL.ImageOps
+import pytest
 import requests
 from numpy.linalg import norm
 from packaging import version
@@ -267,7 +267,7 @@ def slow(test_case):
     Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
 
     """
-    return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
+    return pytest.mark.skipif(not _run_slow_tests, reason="test is slow")(test_case)
 
 
 def nightly(test_case):
@@ -277,7 +277,7 @@ def nightly(test_case):
     Slow tests are skipped by default. Set the RUN_NIGHTLY environment variable to a truthy value to run them.
 
     """
-    return unittest.skipUnless(_run_nightly_tests, "test is nightly")(test_case)
+    return pytest.mark.skipif(not _run_nightly_tests, reason="test is nightly")(test_case)
 
 
 def is_torch_compile(test_case):
@@ -287,23 +287,23 @@ def is_torch_compile(test_case):
     Compile tests are skipped by default. Set the RUN_COMPILE environment variable to a truthy value to run them.
 
     """
-    return unittest.skipUnless(_run_compile_tests, "test is torch compile")(test_case)
+    return pytest.mark.skipif(not _run_compile_tests, reason="test is torch compile")(test_case)
 
 
 def require_torch(test_case):
     """
     Decorator marking a test that requires PyTorch. These tests are skipped when PyTorch isn't installed.
     """
-    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)
+    return pytest.mark.skipif(not is_torch_available(), reason="test requires PyTorch")(test_case)
 
 
 def require_torch_2(test_case):
     """
     Decorator marking a test that requires PyTorch 2. These tests are skipped when it isn't installed.
     """
-    return unittest.skipUnless(is_torch_available() and is_torch_version(">=", "2.0.0"), "test requires PyTorch 2")(
-        test_case
-    )
+    return pytest.mark.skipif(
+        not (is_torch_available() and is_torch_version(">=", "2.0.0")), reason="test requires PyTorch 2"
+    )(test_case)
 
 
 def require_torch_version_greater_equal(torch_version):
@@ -311,8 +311,9 @@ def require_torch_version_greater_equal(torch_version):
 
     def decorator(test_case):
         correct_torch_version = is_torch_available() and is_torch_version(">=", torch_version)
-        return unittest.skipUnless(
-            correct_torch_version, f"test requires torch with the version greater than or equal to {torch_version}"
+        return pytest.mark.skipif(
+            not correct_torch_version,
+            reason=f"test requires torch with the version greater than or equal to {torch_version}",
         )(test_case)
 
     return decorator
@@ -323,8 +324,8 @@ def require_torch_version_greater(torch_version):
 
     def decorator(test_case):
         correct_torch_version = is_torch_available() and is_torch_version(">", torch_version)
-        return unittest.skipUnless(
-            correct_torch_version, f"test requires torch with the version greater than {torch_version}"
+        return pytest.mark.skipif(
+            not correct_torch_version, reason=f"test requires torch with the version greater than {torch_version}"
         )(test_case)
 
     return decorator
@@ -332,19 +333,18 @@ def decorator(test_case):
 
 def require_torch_gpu(test_case):
     """Decorator marking a test that requires CUDA and PyTorch."""
-    return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")(
-        test_case
-    )
+    return pytest.mark.skipif(torch_device != "cuda", reason="test requires PyTorch+CUDA")(test_case)
 
 
 def require_torch_cuda_compatibility(expected_compute_capability):
     def decorator(test_case):
         if torch.cuda.is_available():
             current_compute_capability = get_torch_cuda_device_capability()
-            return unittest.skipUnless(
-                float(current_compute_capability) == float(expected_compute_capability),
-                "Test not supported for this compute capability.",
-            )
+            return pytest.mark.skipif(
+                float(current_compute_capability) != float(expected_compute_capability),
+                reason="Test not supported for this compute capability.",
+            )(test_case)
+        return test_case
 
     return decorator
 
@@ -352,9 +352,7 @@ def decorator(test_case):
 # These decorators are for accelerator-specific behaviours that are not GPU-specific
 def require_torch_accelerator(test_case):
     """Decorator marking a test that requires an accelerator backend and PyTorch."""
-    return unittest.skipUnless(is_torch_available() and torch_device != "cpu", "test requires accelerator+PyTorch")(
-        test_case
-    )
+    return pytest.mark.skipif(torch_device == "cpu", reason="test requires accelerator+PyTorch")(test_case)
 
 
 def require_torch_multi_gpu(test_case):
@@ -364,11 +362,11 @@ def require_torch_multi_gpu(test_case):
     -k "multi_gpu"
     """
     if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
+        return pytest.mark.skip(reason="test requires PyTorch")(test_case)
 
     import torch
 
-    return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
+    return pytest.mark.skipif(torch.cuda.device_count() <= 1, reason="test requires multiple GPUs")(test_case)
 
 
 def require_torch_multi_accelerator(test_case):
@@ -377,27 +375,28 @@ def require_torch_multi_accelerator(test_case):
     without multiple hardware accelerators.
     """
     if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
+        return pytest.mark.skip(reason="test requires PyTorch")(test_case)
 
     import torch
 
-    return unittest.skipUnless(
-        torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators"
+    return pytest.mark.skipif(
+        not (torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1),
+        reason="test requires multiple hardware accelerators",
     )(test_case)
 
 
 def require_torch_accelerator_with_fp16(test_case):
     """Decorator marking a test that requires an accelerator with support for the FP16 data type."""
-    return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")(
-        test_case
-    )
+    return pytest.mark.skipif(
+        not _is_torch_fp16_available(torch_device), reason="test requires accelerator with fp16 support"
+    )(test_case)
 
 
 def require_torch_accelerator_with_fp64(test_case):
     """Decorator marking a test that requires an accelerator with support for the FP64 data type."""
-    return unittest.skipUnless(_is_torch_fp64_available(torch_device), "test requires accelerator with fp64 support")(
-        test_case
-    )
+    return pytest.mark.skipif(
+        not _is_torch_fp64_available(torch_device), reason="test requires accelerator with fp64 support"
+    )(test_case)
 
 
 def require_big_gpu_with_torch_cuda(test_case):
@@ -406,17 +405,17 @@ def require_big_gpu_with_torch_cuda(test_case):
     etc.
     """
     if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
+        return pytest.mark.skip(reason="test requires PyTorch")(test_case)
 
     import torch
 
     if not torch.cuda.is_available():
-        return unittest.skip("test requires PyTorch CUDA")(test_case)
+        return pytest.mark.skip(reason="test requires PyTorch CUDA")(test_case)
 
     device_properties = torch.cuda.get_device_properties(0)
     total_memory = device_properties.total_memory / (1024**3)
-    return unittest.skipUnless(
-        total_memory >= BIG_GPU_MEMORY, f"test requires a GPU with at least {BIG_GPU_MEMORY} GB memory"
+    return pytest.mark.skipif(
+        total_memory < BIG_GPU_MEMORY, reason=f"test requires a GPU with at least {BIG_GPU_MEMORY} GB memory"
     )(test_case)
 
 
@@ -430,12 +429,12 @@ def require_big_accelerator(test_case):
     test_case = pytest.mark.big_accelerator(test_case)
 
     if not is_torch_available():
-        return unittest.skip("test requires PyTorch")(test_case)
+        return pytest.mark.skip(reason="test requires PyTorch")(test_case)
 
     import torch
 
     if not (torch.cuda.is_available() or torch.xpu.is_available()):
-        return unittest.skip("test requires PyTorch CUDA")(test_case)
+        return pytest.mark.skip(reason="test requires PyTorch CUDA")(test_case)
 
     if torch.xpu.is_available():
         device_properties = torch.xpu.get_device_properties(0)
@@ -443,30 +442,30 @@ def require_big_accelerator(test_case):
         device_properties = torch.cuda.get_device_properties(0)
 
     total_memory = device_properties.total_memory / (1024**3)
-    return unittest.skipUnless(
-        total_memory >= BIG_GPU_MEMORY,
-        f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
+    return pytest.mark.skipif(
+        total_memory < BIG_GPU_MEMORY,
+        reason=f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
     )(test_case)
 
 
 def require_torch_accelerator_with_training(test_case):
     """Decorator marking a test that requires an accelerator with support for training."""
-    return unittest.skipUnless(
-        is_torch_available() and backend_supports_training(torch_device),
-        "test requires accelerator with training support",
+    return pytest.mark.skipif(
+        not (is_torch_available() and backend_supports_training(torch_device)),
+        reason="test requires accelerator with training support",
     )(test_case)
 
 
 def skip_mps(test_case):
     """Decorator marking a test to skip if torch_device is 'mps'"""
-    return unittest.skipUnless(torch_device != "mps", "test requires non 'mps' device")(test_case)
+    return pytest.mark.skipif(torch_device == "mps", reason="test requires non 'mps' device")(test_case)
 
 
 def require_flax(test_case):
     """
     Decorator marking a test that requires JAX & Flax. These tests are skipped when one / both are not installed
     """
-    return unittest.skipUnless(is_flax_available(), "test requires JAX & Flax")(test_case)
+    return pytest.mark.skipif(not is_flax_available(), reason="test requires JAX & Flax")(test_case)
 
 
 def require_compel(test_case):
@@ -474,21 +473,21 @@ def require_compel(test_case):
     Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when
     the library is not installed.
     """
-    return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case)
+    return pytest.mark.skipif(not is_compel_available(), reason="test requires compel")(test_case)
 
 
 def require_onnxruntime(test_case):
     """
     Decorator marking a test that requires onnxruntime. These tests are skipped when onnxruntime isn't installed.
     """
-    return unittest.skipUnless(is_onnx_available(), "test requires onnxruntime")(test_case)
+    return pytest.mark.skipif(not is_onnx_available(), reason="test requires onnxruntime")(test_case)
 
 
 def require_note_seq(test_case):
     """
     Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed.
     """
-    return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case)
+    return pytest.mark.skipif(not is_note_seq_available(), reason="test requires note_seq")(test_case)
 
 
 def require_accelerator(test_case):
@@ -496,14 +495,14 @@ def require_accelerator(test_case):
     Decorator marking a test that requires a hardware accelerator backend. These tests are skipped when there are no
     hardware accelerator available.
     """
-    return unittest.skipUnless(torch_device != "cpu", "test requires a hardware accelerator")(test_case)
+    return pytest.mark.skipif(torch_device == "cpu", reason="test requires a hardware accelerator")(test_case)
 
 
 def require_torchsde(test_case):
     """
     Decorator marking a test that requires torchsde. These tests are skipped when torchsde isn't installed.
     """
-    return unittest.skipUnless(is_torchsde_available(), "test requires torchsde")(test_case)
+    return pytest.mark.skipif(not is_torchsde_available(), reason="test requires torchsde")(test_case)
 
 
 def require_peft_backend(test_case):
@@ -511,35 +510,35 @@ def require_peft_backend(test_case):
     Decorator marking a test that requires PEFT backend, this would require some specific versions of PEFT and
     transformers.
     """
-    return unittest.skipUnless(USE_PEFT_BACKEND, "test requires PEFT backend")(test_case)
+    return pytest.mark.skipif(not USE_PEFT_BACKEND, reason="test requires PEFT backend")(test_case)
 
 
 def require_timm(test_case):
     """
     Decorator marking a test that requires timm. These tests are skipped when timm isn't installed.
     """
-    return unittest.skipUnless(is_timm_available(), "test requires timm")(test_case)
+    return pytest.mark.skipif(not is_timm_available(), reason="test requires timm")(test_case)
 
 
 def require_bitsandbytes(test_case):
     """
     Decorator marking a test that requires bitsandbytes. These tests are skipped when bitsandbytes isn't installed.
     """
-    return unittest.skipUnless(is_bitsandbytes_available(), "test requires bitsandbytes")(test_case)
+    return pytest.mark.skipif(not is_bitsandbytes_available(), reason="test requires bitsandbytes")(test_case)
 
 
 def require_quanto(test_case):
     """
     Decorator marking a test that requires quanto. These tests are skipped when quanto isn't installed.
     """
-    return unittest.skipUnless(is_optimum_quanto_available(), "test requires quanto")(test_case)
+    return pytest.mark.skipif(not is_optimum_quanto_available(), reason="test requires quanto")(test_case)
 
 
 def require_accelerate(test_case):
     """
     Decorator marking a test that requires accelerate. These tests are skipped when accelerate isn't installed.
     """
-    return unittest.skipUnless(is_accelerate_available(), "test requires accelerate")(test_case)
+    return pytest.mark.skipif(not is_accelerate_available(), reason="test requires accelerate")(test_case)
 
 
 def require_peft_version_greater(peft_version):
@@ -552,8 +551,8 @@ def decorator(test_case):
         correct_peft_version = is_peft_available() and version.parse(
             version.parse(importlib.metadata.version("peft")).base_version
         ) > version.parse(peft_version)
-        return unittest.skipUnless(
-            correct_peft_version, f"test requires PEFT backend with the version greater than {peft_version}"
+        return pytest.mark.skipif(
+            not correct_peft_version, reason=f"test requires PEFT backend with the version greater than {peft_version}"
         )(test_case)
 
     return decorator
@@ -569,9 +568,9 @@ def decorator(test_case):
         correct_transformers_version = is_transformers_available() and version.parse(
             version.parse(importlib.metadata.version("transformers")).base_version
         ) > version.parse(transformers_version)
-        return unittest.skipUnless(
-            correct_transformers_version,
-            f"test requires transformers with the version greater than {transformers_version}",
+        return pytest.mark.skipif(
+            not correct_transformers_version,
+            reason=f"test requires transformers with the version greater than {transformers_version}",
         )(test_case)
 
     return decorator
@@ -582,8 +581,9 @@ def decorator(test_case):
         correct_accelerate_version = is_accelerate_available() and version.parse(
             version.parse(importlib.metadata.version("accelerate")).base_version
         ) > version.parse(accelerate_version)
-        return unittest.skipUnless(
-            correct_accelerate_version, f"Test requires accelerate with the version greater than {accelerate_version}."
+        return pytest.mark.skipif(
+            not correct_accelerate_version,
+            reason=f"Test requires accelerate with the version greater than {accelerate_version}.",
         )(test_case)
 
     return decorator
@@ -594,8 +594,8 @@ def decorator(test_case):
         correct_bnb_version = is_bitsandbytes_available() and version.parse(
             version.parse(importlib.metadata.version("bitsandbytes")).base_version
         ) > version.parse(bnb_version)
-        return unittest.skipUnless(
-            correct_bnb_version, f"Test requires bitsandbytes with the version greater than {bnb_version}."
+        return pytest.mark.skipif(
+            not correct_bnb_version, reason=f"Test requires bitsandbytes with the version greater than {bnb_version}."
         )(test_case)
 
     return decorator
@@ -606,8 +606,9 @@ def decorator(test_case):
         correct_hf_hub_version = version.parse(
             version.parse(importlib.metadata.version("huggingface_hub")).base_version
         ) > version.parse(hf_hub_version)
-        return unittest.skipUnless(
-            correct_hf_hub_version, f"Test requires huggingface_hub with the version greater than {hf_hub_version}."
+        return pytest.mark.skipif(
+            not correct_hf_hub_version,
+            reason=f"Test requires huggingface_hub with the version greater than {hf_hub_version}.",
         )(test_case)
 
     return decorator
@@ -618,8 +619,8 @@ def decorator(test_case):
         correct_gguf_version = is_gguf_available() and version.parse(
             version.parse(importlib.metadata.version("gguf")).base_version
         ) >= version.parse(gguf_version)
-        return unittest.skipUnless(
-            correct_gguf_version, f"Test requires gguf with the version greater than {gguf_version}."
+        return pytest.mark.skipif(
+            not correct_gguf_version, reason=f"Test requires gguf with the version greater than {gguf_version}."
         )(test_case)
 
     return decorator
@@ -630,8 +631,8 @@ def decorator(test_case):
         correct_torchao_version = is_torchao_available() and version.parse(
             version.parse(importlib.metadata.version("torchao")).base_version
         ) >= version.parse(torchao_version)
-        return unittest.skipUnless(
-            correct_torchao_version, f"Test requires torchao with version greater than {torchao_version}."
+        return pytest.mark.skipif(
+            not correct_torchao_version, reason=f"Test requires torchao with version greater than {torchao_version}."
         )(test_case)
 
     return decorator
@@ -642,8 +643,8 @@ def decorator(test_case):
         correct_kernels_version = is_kernels_available() and version.parse(
             version.parse(importlib.metadata.version("kernels")).base_version
         ) >= version.parse(kernels_version)
-        return unittest.skipUnless(
-            correct_kernels_version, f"Test requires kernels with version greater than {kernels_version}."
+        return pytest.mark.skipif(
+            not correct_kernels_version, reason=f"Test requires kernels with version greater than {kernels_version}."
         )(test_case)
 
     return decorator
@@ -653,7 +654,7 @@ def deprecate_after_peft_backend(test_case):
     """
     Decorator marking a test that will be skipped after PEFT backend
     """
-    return unittest.skipUnless(not USE_PEFT_BACKEND, "test skipped in favor of PEFT backend")(test_case)
+    return pytest.mark.skipif(USE_PEFT_BACKEND, reason="test skipped in favor of PEFT backend")(test_case)
 
 
 def get_python_version():
@@ -1064,8 +1065,8 @@ def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
     To run a test in a subprocess. In particular, this can avoid (GPU) memory issue.
 
     Args:
-        test_case (`unittest.TestCase`):
-            The test that will run `target_func`.
+        test_case:
+            The test case object that will run `target_func`.
         target_func (`Callable`):
             The function implementing the actual testing logic.
         inputs (`dict`, *optional*, defaults to `None`):
@@ -1083,7 +1084,7 @@ def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None):
     input_queue = ctx.Queue(1)
     output_queue = ctx.JoinableQueue(1)
 
-    # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle.
+    # We can't send test case objects to the child, otherwise we get issues regarding pickle.
     input_queue.put(inputs, timeout=timeout)
 
     process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout))
@@ -1423,6 +1424,8 @@ def _get_expected_safetensors_files(
         offload_to_disk_path: str,
         offload_type: str,
         num_blocks_per_group: Optional[int] = None,
+        block_modules: Optional[List[str]] = None,
+        module_prefix: str = "",
     ) -> Set[str]:
         expected_files = set()
 
@@ -1434,23 +1437,36 @@ def get_hashed_filename(group_id: str) -> str:
             if num_blocks_per_group is None:
                 raise ValueError("num_blocks_per_group must be provided for 'block_level' offloading.")
 
-            # Handle groups of ModuleList and Sequential blocks
+            block_modules_set = set(block_modules) if block_modules is not None else set()
+
+            modules_with_group_offloading = set()
             unmatched_modules = []
             for name, submodule in module.named_children():
-                if not isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
-                    unmatched_modules.append(module)
-                    continue
+                if name in block_modules_set:
+                    new_prefix = f"{module_prefix}{name}." if module_prefix else f"{name}."
+                    submodule_files = _get_expected_safetensors_files(
+                        submodule, offload_to_disk_path, offload_type, num_blocks_per_group, block_modules, new_prefix
+                    )
+                    expected_files.update(submodule_files)
+                    modules_with_group_offloading.add(name)
+
+                elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+                    for i in range(0, len(submodule), num_blocks_per_group):
+                        current_modules = submodule[i : i + num_blocks_per_group]
+                        if not current_modules:
+                            continue
+                        group_id = f"{module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
+                        expected_files.add(get_hashed_filename(group_id))
+                        for j in range(i, i + len(current_modules)):
+                            modules_with_group_offloading.add(f"{name}.{j}")
+                else:
+                    unmatched_modules.append(submodule)
 
-                for i in range(0, len(submodule), num_blocks_per_group):
-                    current_modules = submodule[i : i + num_blocks_per_group]
-                    if not current_modules:
-                        continue
-                    group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
-                    expected_files.add(get_hashed_filename(group_id))
+            parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
+            buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)
 
-            # Handle the group for unmatched top-level modules and parameters
-            for module in unmatched_modules:
-                expected_files.add(get_hashed_filename(f"{module.__class__.__name__}_unmatched_group"))
+            if len(unmatched_modules) > 0 or len(parameters) > 0 or len(buffers) > 0:
+                expected_files.add(get_hashed_filename(f"{module_prefix}{module.__class__.__name__}_unmatched_group"))
 
         elif offload_type == "leaf_level":
             # Handle leaf-level module groups
@@ -1491,12 +1507,13 @@ def _check_safetensors_serialization(
         offload_to_disk_path: str,
         offload_type: str,
         num_blocks_per_group: Optional[int] = None,
+        block_modules: Optional[List[str]] = None,
     ) -> bool:
         if not os.path.isdir(offload_to_disk_path):
             return False, None, None
 
         expected_files = _get_expected_safetensors_files(
-            module, offload_to_disk_path, offload_type, num_blocks_per_group
+            module, offload_to_disk_path, offload_type, num_blocks_per_group, block_modules
         )
         actual_files = set(glob.glob(os.path.join(offload_to_disk_path, "*.safetensors")))
         missing_files = expected_files - actual_files
diff --git a/utils/check_doc_toc.py b/utils/check_doc_toc.py
index 0dd02cde86c1..050b093991e6 100644
--- a/utils/check_doc_toc.py
+++ b/utils/check_doc_toc.py
@@ -21,20 +21,23 @@
 
 PATH_TO_TOC = "docs/source/en/_toctree.yml"
 
+# Titles that should maintain their position and not be sorted alphabetically
+FIXED_POSITION_TITLES = {"overview", "autopipeline"}
+
 
 def clean_doc_toc(doc_list):
     """
     Cleans the table of content of the model documentation by removing duplicates and sorting models alphabetically.
     """
     counts = defaultdict(int)
-    overview_doc = []
+    fixed_position_docs = []
     new_doc_list = []
     for doc in doc_list:
         if "local" in doc:
             counts[doc["local"]] += 1
 
-        if doc["title"].lower() == "overview":
-            overview_doc.append({"local": doc["local"], "title": doc["title"]})
+        if doc["title"].lower() in FIXED_POSITION_TITLES:
+            fixed_position_docs.append({"local": doc["local"], "title": doc["title"]})
         else:
             new_doc_list.append(doc)
 
@@ -57,14 +60,13 @@ def clean_doc_toc(doc_list):
     new_doc.extend([doc for doc in doc_list if "local" not in counts or counts[doc["local"]] == 1])
     new_doc = sorted(new_doc, key=lambda s: s["title"].lower())
 
-    # "overview" gets special treatment and is always first
-    if len(overview_doc) > 1:
-        raise ValueError("{doc_list} has two 'overview' docs which is not allowed.")
-
-    overview_doc.extend(new_doc)
+    # Fixed-position titles maintain their original order
+    result = []
+    for doc in fixed_position_docs:
+        result.append(doc)
 
-    # Sort
-    return overview_doc
+    result.extend(new_doc)
+    return result
 
 
 def check_scheduler_doc(overwrite=False):