
Commit c62d60f

add aten bucketing pass
1 parent db22479 commit c62d60f

5 files changed: +56 −8 lines


torchtitan/config/job_config.py

Lines changed: 1 addition & 1 deletion
@@ -737,7 +737,7 @@ class Experimental:
 
     autop_force_bf16: bool = False
 
-    enable_simplefsdp_passes: bool = False
+    enable_autobucketing_passes: str = ""
 
 @dataclass
 class Validation:
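For reference, a minimal sketch of how the replacement field reads in context (field name and default come from this diff; the surrounding dataclass contents and comments are assumptions):

from dataclasses import dataclass


@dataclass
class Experimental:
    # ...other experimental options elided...
    autop_force_bf16: bool = False

    # "" disables autobucketing; "inductor" selects simplefsdp's inductor-level
    # autobucketing/reordering passes; "aten" selects the aten-level bucketing
    # pass (the two values handled in train.py further down).
    enable_autobucketing_passes: str = ""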

torchtitan/experiments/auto_parallel/README.md

Lines changed: 4 additions & 2 deletions
@@ -4,8 +4,10 @@ requires installing git@github.com:pytorch-labs/autoparallel.git
 
 `CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4`
 
-Use simplefsdp's autobucketing pass:
+Use autobucketing pass:
 
-`CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4 --experimental.enable_simplefsdp_passes --compile.enable`
+`CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4 --experimental.enable_autobucketing_passes "aten" --compile.enable`
+
+Set `experimental.enable_autobucketing_passes` to
 
 (or llama3-8b.toml)
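The value passed to --experimental.enable_autobucketing_passes selects which pass family runs; a minimal sketch of that mapping, with a hypothetical helper name used purely for illustration (the actual branching is in the train.py changes below):

def describe_autobucketing(mode: str) -> str:
    """Map experimental.enable_autobucketing_passes to what actually runs."""
    if mode == "inductor":
        return "simplefsdp's inductor-level autobucketing and comm-reordering passes"
    if mode == "aten":
        return "aten-level autobucketing registered as a post-grad custom pass"
    return "no autobucketing; the bucket_*_fx settings apply instead"


print(describe_autobucketing("aten"))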

torchtitan/experiments/simple_fsdp/parallelize.py

Lines changed: 39 additions & 2 deletions
@@ -93,8 +93,45 @@ def parallelize_llama(
         )
         logger.info("Applied Data Parallel (dp mode=%s) to the model", dp_mode)
 
-    if job_config.compile.enable and "model" in job_config.compile.components:
-        torch._inductor.config.reorder_for_peak_memory = False
+    if job_config.compile.enable:
+        from functools import partial
+        bucket_level = ""
+        torch._inductor.config.run_with_post_grad_graph = False
+        if bucket_level == "inductor":
+            # enable simplefsdp's autobucketing and reorder passes (original code in https://github.com/pytorch/pytorch/pull/160282)
+            from autoparallel.auto_bucketing import (
+                simple_fsdp_autobucketing_reordering_pass,
+                simplefsdp_autobucketing_config,
+            )
+
+            torch._inductor.config.allow_buffer_reuse = False
+            torch._inductor.config.reorder_for_peak_memory = False
+            torch._inductor.config.reorder_for_compute_comm_overlap = True
+            simplefsdp_autobucketing_config.save_estimation_path = (
+                "/tmp/torchtitan_simplefsdp_comm_estimation.pkl"
+            )
+            simplefsdp_autobucketing_config.calibrate_number = 20
+            simple_fsdp_autobucketing_reordering_pass = partial(
+                simple_fsdp_autobucketing_reordering_pass,
+                configs=simplefsdp_autobucketing_config,
+            )
+            torch._inductor.config.reorder_for_compute_comm_overlap_passes = [
+                simple_fsdp_autobucketing_reordering_pass
+            ]
+
+            # Don't use both sets of passes at the same time!
+            torch._inductor.config.bucket_all_gathers_fx = "none"
+            torch._inductor.config.bucket_reduce_scatters_fx = "none"
+        elif bucket_level == "aten":
+            from autoparallel.auto_bucketing import aten_autobucketing_reordering_pass, aten_autobucketing_config
+            torch._inductor.config.reorder_for_peak_memory = False
+            torch._inductor.config.reorder_for_compute_comm_overlap = False
+            aten_autobucketing_reordering_pass = partial(
+                aten_autobucketing_reordering_pass,
+                configs=aten_autobucketing_config,
+            )
+            torch._inductor.config.post_grad_custom_post_pass = aten_autobucketing_reordering_pass
+
         model = torch.compile(model, fullgraph=True)
 
     return model
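As written, bucket_level is hard-coded to "", so neither the "inductor" nor the "aten" branch fires until that value is edited. A hedged sketch of driving the selector from the new experimental flag instead; this helper is hypothetical and not part of the commit:

def select_bucket_level(job_config) -> str:
    # Hypothetical: reuse the new experimental knob rather than the hard-coded
    # empty string; valid values are "", "inductor", and "aten".
    return job_config.experimental.enable_autobucketing_passes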

torchtitan/models/llama3/train_configs/llama3_8b.toml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ local_batch_size = 1
 seq_len = 8192
 max_norm = 1.0 # grad norm clipping
 steps = 1000
-dataset = "c4"
+dataset = "c4_test"
 
 [parallelism]
 data_parallel_replicate_degree = 1

torchtitan/train.py

Lines changed: 11 additions & 2 deletions
@@ -128,9 +128,8 @@ def __init__(self, job_config: JobConfig):
         torch._inductor.config.force_disable_caches = True
         # this is necessary for working with reordering passes. Just leave it set for all the jobs for now.
         torch._inductor.config.allow_buffer_reuse = False
-
         # allow configuring inductor comms optimizations from torchtitan commandline
-        if job_config.experimental.enable_simplefsdp_passes:
+        if job_config.experimental.enable_autobucketing_passes == "inductor":
             # enable simplefsdp's autobucketing and reorder passes (original code in https://github.com/pytorch/pytorch/pull/160282)
             from autoparallel.auto_bucketing import (
                 simple_fsdp_autobucketing_reordering_pass,
@@ -143,6 +142,7 @@ def __init__(self, job_config: JobConfig):
             simplefsdp_autobucketing_config.save_estimation_path = (
                 "/tmp/torchtitan_simplefsdp_comm_estimation.pkl"
             )
+            simplefsdp_autobucketing_config.calibrate_number = 20
             simple_fsdp_autobucketing_reordering_pass = partial(
                 simple_fsdp_autobucketing_reordering_pass,
                 configs=simplefsdp_autobucketing_config,
@@ -154,6 +154,15 @@ def __init__(self, job_config: JobConfig):
             # Don't use both sets of passes at the same time!
             torch._inductor.config.bucket_all_gathers_fx = "none"
             torch._inductor.config.bucket_reduce_scatters_fx = "none"
+        elif job_config.experimental.enable_autobucketing_passes == "aten":
+            from autoparallel.auto_bucketing import aten_autobucketing_reordering_pass, aten_autobucketing_config
+            torch._inductor.config.reorder_for_peak_memory = False
+            torch._inductor.config.reorder_for_compute_comm_overlap = False
+            aten_autobucketing_reordering_pass = partial(
+                aten_autobucketing_reordering_pass,
+                configs=aten_autobucketing_config,
+            )
+            torch._inductor.config.post_grad_custom_post_pass = aten_autobucketing_reordering_pass
         else:
             torch._inductor.config.bucket_all_gathers_fx = (
                 job_config.experimental.bucket_all_gathers_fx
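For context on the "aten" branch: torch._inductor.config.post_grad_custom_post_pass holds a callable that inductor invokes on the post-grad FX graph. A minimal, self-contained sketch of registering a pass the same way; the body below is a placeholder, not autoparallel's bucketing logic:

import torch
import torch._inductor.config as inductor_config


def placeholder_post_grad_pass(graph: torch.fx.Graph) -> None:
    # A real pass would rewrite the graph here, e.g. grouping collective ops
    # into buckets; this commit plugs in aten_autobucketing_reordering_pass.
    for node in graph.nodes:
        _ = node.target  # inspect only


inductor_config.post_grad_custom_post_pass = placeholder_post_grad_pass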
