From a3fa0c4c8a2fdb31a9278f9fcd86a019a9caef13 Mon Sep 17 00:00:00 2001
From: Jiarui Fang <fangjiarui123@gmail.com>
Date: Fri, 25 Oct 2024 10:32:33 +0800
Subject: [PATCH] upgrade diffusers to 0.31 (#316)

---
 benchmark/run.sh                | 12 +++++++++---
 examples/pixartalpha_example.py |  5 ++++-
 examples/run.sh                 |  9 +++++++--
 examples/sd3_example.py         |  3 ++-
 setup.py                        |  2 +-
 xfuser/__version__.py           |  2 +-
 6 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/benchmark/run.sh b/benchmark/run.sh
index 4adc2ea5..8a4c8731 100644
--- a/benchmark/run.sh
+++ b/benchmark/run.sh
@@ -6,8 +6,11 @@ set -x
 # MODEL="/mnt/models/SD/stable-diffusion-3-medium-diffusers"
 # SCRIPT="./examples/sd3_example.py"
 
-MODEL="/mnt/models/SD/HunyuanDiT-v1.2-Diffusers"
-SCRIPT="./examples/hunyuandit_example.py"
+# MODEL="/mnt/models/SD/HunyuanDiT-v1.2-Diffusers"
+# SCRIPT="./examples/hunyuandit_example.py"
+
+MODEL="/cfs/dit/FLUX.1-dev/"
+SCRIPT="./examples/flux_example.py"
 
 export PYTHONPATH=$PWD:$PYTHONPATH
 
@@ -15,4 +18,7 @@ python benchmark/single_node_latency_test.py \
 --model_id $MODEL \
 --script $SCRIPT \
 --sizes 1024 \
---no_use_resolution_binning
\ No newline at end of file
+--no_use_resolution_binning \
+--num_inference_steps 28 \
+--no_use_cfg_parallel \
+--n_gpus 4
\ No newline at end of file
diff --git a/examples/pixartalpha_example.py b/examples/pixartalpha_example.py
index 4a333352..4d50467a 100644
--- a/examples/pixartalpha_example.py
+++ b/examples/pixartalpha_example.py
@@ -24,6 +24,7 @@ def main():
         engine_config=engine_config,
         torch_dtype=torch.float16,
     ).to(f"cuda:{local_rank}")
+    model_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
     pipe.prepare_run(input_config)
 
     torch.cuda.reset_peak_memory_stats()
@@ -62,7 +63,9 @@ def main():
                 print(img_file)
 
     if get_world_group().rank == get_world_group().world_size - 1:
-        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
+        print(
+            f"epoch time: {elapsed_time:.2f} sec, model memory: {model_memory/1e9:.2f} GB, overall memory: {peak_memory/1e9:.2f} GB"
+        )
     get_runtime_state().destory_distributed_env()
 
 
diff --git a/examples/run.sh b/examples/run.sh
index 6119ee95..c24f8c05 100644
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -27,8 +27,10 @@ mkdir -p ./results
 # task args
 TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning"
 
+
+# On 8 gpus: pp=2, ulysses=2, ring=2 (2*2*2 = 8); cfg_parallel stays off unless CFG_ARGS below is uncommented
 N_GPUS=8
-PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1 --pipefusion_parallel_degree 8"
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 2 --ring_degree 2"
 
 # CFG_ARGS="--use_cfg_parallel"
 
@@ -43,6 +45,9 @@ PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1 --pipefusion_parallel_degree 8
 # Another compile option is `--use_onediff` which will use onediff's compiler.
 # COMPILE_FLAG="--use_torch_compile"
 
+
+# export CUDA_VISIBLE_DEVICES=4,5,6,7
+
 torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
 --model $MODEL_ID \
 $PARALLEL_ARGS \
@@ -50,7 +55,7 @@ $TASK_ARGS \
 $PIPEFUSION_ARGS \
 $OUTPUT_ARGS \
 --num_inference_steps $INFERENCE_STEP \
---warmup_steps 0 \
+--warmup_steps 1 \
 --prompt "brown dog laying on the ground with a metal bowl in front of him." \
 $CFG_ARGS \
 $PARALLLEL_VAE \
diff --git a/examples/sd3_example.py b/examples/sd3_example.py
index 5ea5cc27..17de013a 100644
--- a/examples/sd3_example.py
+++ b/examples/sd3_example.py
@@ -66,8 +66,9 @@ def main():
 
     if get_world_group().rank == get_world_group().world_size - 1:
         print(
-            f"{parallel_info} epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
+            f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
         )
+
     get_runtime_state().destory_distributed_env()
 
 
diff --git a/setup.py b/setup.py
index 0749593a..452ce123 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@ def get_cuda_version():
         install_requires=[
             "torch>=2.1.0",
             "accelerate>=0.33.0",
-            "diffusers@git+https://github.com/huggingface/diffusers",  # NOTE: diffusers>=0.31.0.dev is necessary for CogVideoX and Flux
+            "diffusers>=0.31",  # NOTE: diffusers>=0.31.0 is necessary for CogVideoX and Flux
             "transformers>=4.39.1",
             "sentencepiece>=0.1.99",
             "beautifulsoup4>=4.12.3",
diff --git a/xfuser/__version__.py b/xfuser/__version__.py
index f9aa3e11..e19434e2 100644
--- a/xfuser/__version__.py
+++ b/xfuser/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.2"
+__version__ = "0.3.3"