
Commit 6e2a44f

add aten bucketing pass
1 parent db22479 commit 6e2a44f

2 files changed: +40 -3 lines changed

torchtitan/experiments/simple_fsdp/parallelize.py

Lines changed: 39 additions & 2 deletions
@@ -93,8 +93,45 @@ def parallelize_llama(
         )
         logger.info("Applied Data Parallel (dp mode=%s) to the model", dp_mode)
 
-    if job_config.compile.enable and "model" in job_config.compile.components:
-        torch._inductor.config.reorder_for_peak_memory = False
+    if job_config.compile.enable:
+        from functools import partial
+        bucket_level = ""
+        torch._inductor.config.run_with_post_grad_graph = True
+        if bucket_level == "inductor":
+            # enable simplefsdp's autobucketing and reorder passes (original code in https://github.com/pytorch/pytorch/pull/160282)
+            from autoparallel.auto_bucketing import (
+                simple_fsdp_autobucketing_reordering_pass,
+                simplefsdp_autobucketing_config,
+            )
+
+            torch._inductor.config.allow_buffer_reuse = False
+            torch._inductor.config.reorder_for_peak_memory = False
+            torch._inductor.config.reorder_for_compute_comm_overlap = True
+            simplefsdp_autobucketing_config.save_estimation_path = (
+                "/tmp/torchtitan_simplefsdp_comm_estimation.pkl"
+            )
+            simplefsdp_autobucketing_config.calibrate_number = 20
+            simple_fsdp_autobucketing_reordering_pass = partial(
+                simple_fsdp_autobucketing_reordering_pass,
+                configs=simplefsdp_autobucketing_config,
+            )
+            torch._inductor.config.reorder_for_compute_comm_overlap_passes = [
+                simple_fsdp_autobucketing_reordering_pass
+            ]
+
+            # Don't use both sets of passes at the same time!
+            torch._inductor.config.bucket_all_gathers_fx = "none"
+            torch._inductor.config.bucket_reduce_scatters_fx = "none"
+        elif bucket_level == "aten":
+            from autoparallel.auto_bucketing import aten_autobucketing_reordering_pass, aten_autobucketing_config
+            torch._inductor.config.reorder_for_peak_memory = False
+            torch._inductor.config.reorder_for_compute_comm_overlap = False
+            aten_autobucketing_reordering_pass = partial(
+                aten_autobucketing_reordering_pass,
+                configs=aten_autobucketing_config,
+            )
+            torch._inductor.config.post_grad_custom_post_pass = aten_autobucketing_reordering_pass
+
         model = torch.compile(model, fullgraph=True)
 
     return model
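
For context, the two branches hang off different Inductor hooks: the "inductor" branch registers a scheduler-level reordering pass through reorder_for_compute_comm_overlap_passes, while the new "aten" branch registers a post-grad FX graph pass through post_grad_custom_post_pass. Below is a minimal sketch of the callable shapes these hooks expect; the function names and bodies (my_post_grad_bucketing_pass, my_reordering_pass) are hypothetical placeholders, not the autoparallel implementations.

import torch
from torch import fx

def my_post_grad_bucketing_pass(graph: fx.Graph) -> None:
    # post_grad_custom_post_pass receives the post-grad ATen FX graph and is
    # expected to mutate it in place (e.g. bucket small all-gathers into
    # larger ones). Placeholder body: it only counts collective nodes and
    # leaves the graph unchanged.
    num_all_gathers = sum(
        1 for node in graph.nodes if "all_gather" in str(node.target)
    )
    print(f"post-grad graph holds {num_all_gathers} all_gather nodes")

def my_reordering_pass(snodes):
    # Entries of reorder_for_compute_comm_overlap_passes take the list of
    # Inductor scheduler nodes and return a (possibly reordered) list so
    # compute can overlap with communication. Identity here.
    return snodes

# Hypothetical wiring mirroring the "aten" branch above; the "inductor"
# branch would instead set
# torch._inductor.config.reorder_for_compute_comm_overlap_passes = [my_reordering_pass]
torch._inductor.config.post_grad_custom_post_pass = my_post_grad_bucketing_pass

Note that as committed, bucket_level = "", so neither branch runs; flipping it to "inductor" or "aten" opts into the corresponding bucketing pass before torch.compile is applied.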

torchtitan/models/llama3/train_configs/llama3_8b.toml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ local_batch_size = 1
 seq_len = 8192
 max_norm = 1.0  # grad norm clipping
 steps = 1000
-dataset = "c4"
+dataset = "c4_test"
 
 [parallelism]
 data_parallel_replicate_degree = 1
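
Presumably the switch from "c4" to "c4_test" points the run at torchtitan's small bundled test split so the new bucketing path can be exercised quickly, rather than streaming the full C4 dataset.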
