
Commit

- add 1xH20 performance
- add 4xH20 performance and 1xH20 performance with torch.compile
Binary2355 committed Feb 24, 2025
1 parent 90c2d72 commit a50dcfd
Showing 4 changed files with 178 additions and 235 deletions.
45 changes: 39 additions & 6 deletions docs/performance/flux.md
@@ -117,15 +117,48 @@ The quality of image generation at 2048px, 3072px, and 4096px resolutions is as

## Cache Methods

We tested the performance of TeaCache and First-Block-Cache on 4xH20 with SP=4.
We tested the performance of TeaCache and First-Block-Cache on 4xH20 with SP=4 and on 1xH20, respectively.
The performance is shown below:

<div align="center">

| Method | Latency (s) |
|----------------|--------|
| Baseline | 2.02s |
| use_teacache | 1.58s |
| use_fbcache | 0.93s |
<table>
<tr>
<th rowspan="3">Method</th>
<th colspan="4">Latency (s)</th>
</tr>
<tr>
<th colspan="2">without torch.compile</th>
<th colspan="2">with torch.compile</th>
</tr>
<tr>
<th>4xH20</th>
<th>1xH20</th>
<th>4xH20</th>
<th>1xH20</th>
</tr>
<tr>
<td>Baseline</td>
<td>2.02s</td>
<td>6.10s</td>
<td>1.81s</td>
<td>5.02s</td>
</tr>
<tr>
<td>use_teacache</td>
<td>1.60s</td>
<td>4.67s</td>
<td>1.55s</td>
<td>3.97s</td>
</tr>
<tr>
<td>use_fbcache</td>
<td>0.93s</td>
<td>2.51s</td>
<td>0.86s</td>
<td>2.10s</td>
</tr>
</table>

</div>
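
For reference, per-image latencies like those above are typically measured around a full generation call with CUDA synchronization before and after the timed region, after a warm-up run so that kernels (and any torch.compile graphs) are already built. The helper below is an illustrative sketch only; the function name and the warm-up/repeat counts are not part of this repository.

```python
import time

import torch


def time_pipeline(run_once, warmup_runs: int = 1, repeats: int = 3) -> float:
    """Average latency in seconds of `run_once`, a zero-argument callable
    that performs one full image generation."""
    for _ in range(warmup_runs):
        run_once()  # warm up kernels and, if enabled, torch.compile graphs
    torch.cuda.synchronize()  # make sure warm-up work has finished
    start = time.time()
    for _ in range(repeats):
        run_once()
    torch.cuda.synchronize()  # wait for all queued GPU work before stopping the clock
    return (time.time() - start) / repeats
```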
12 changes: 9 additions & 3 deletions examples/flux_example.py
@@ -51,11 +51,13 @@ def main():
pipe.prepare_run(input_config, steps=1)

use_cache = engine_args.use_teacache or engine_args.use_fbcache
if (use_cache
use_cache = (
use_cache
and get_pipeline_parallel_world_size() == 1
and get_classifier_free_guidance_world_size() == 1
and get_tensor_model_parallel_world_size() == 1
):
)
if use_cache:
cache_args = {
"rel_l1_thresh": 0.6,
"return_hidden_states_first": False,
@@ -69,7 +71,11 @@ def main():
elif engine_args.use_fbcache:
cache_args["use_cache"] = "Fb"

pipe.transformer = apply_cache_on_transformer(pipe.transformer, **cache_args)
if engine_config.runtime_config.use_torch_compile:
pipe.transformer = torch.compile(apply_cache_on_transformer(pipe.original_transformer, **cache_args))
pipe.prepare_run(input_config, steps=4)
else:
pipe.transformer = apply_cache_on_transformer(pipe.transformer, **cache_args)
torch.cuda.reset_peak_memory_stats()
start_time = time.time()
output = pipe(
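
Taken together, the updated example enables a cache only when the pipeline, classifier-free-guidance, and tensor parallel world sizes are all 1, and wraps the cached transformer in torch.compile when the runtime config requests it. The standalone sketch below restates that logic under stated assumptions: the parallel-size helpers, apply_cache_on_transformer, and the pipe/engine objects come from the example's existing imports (not visible in this diff), the "Tea" value is inferred from the visible elif branch, and cache_args may carry additional keys in the full file.

```python
import torch

# Assumed to be provided by the example's existing imports (not shown in this diff):
# get_pipeline_parallel_world_size, get_classifier_free_guidance_world_size,
# get_tensor_model_parallel_world_size, apply_cache_on_transformer


def maybe_apply_cache(pipe, engine_args, engine_config, input_config):
    """Apply TeaCache / First-Block-Cache when no other model parallelism is active."""
    use_cache = (
        (engine_args.use_teacache or engine_args.use_fbcache)
        and get_pipeline_parallel_world_size() == 1
        and get_classifier_free_guidance_world_size() == 1
        and get_tensor_model_parallel_world_size() == 1
    )
    if not use_cache:
        return

    cache_args = {
        "rel_l1_thresh": 0.6,
        "return_hidden_states_first": False,
        # the full example may set further keys here
        "use_cache": "Tea" if engine_args.use_teacache else "Fb",
    }
    if engine_config.runtime_config.use_torch_compile:
        # Compile the cached copy of the original transformer, then re-warm the pipeline.
        pipe.transformer = torch.compile(
            apply_cache_on_transformer(pipe.original_transformer, **cache_args)
        )
        pipe.prepare_run(input_config, steps=4)
    else:
        pipe.transformer = apply_cache_on_transformer(pipe.transformer, **cache_args)
```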
