From 3ccd12c87667e93dbacd72c0da24060c85421c96 Mon Sep 17 00:00:00 2001
From: Will Constable
Date: Thu, 12 Jun 2025 21:08:11 -0700
Subject: [PATCH] [WIP] Integrate autoparallel into torchtitan

TODO
- try converting model params into fake tensors
- figure out init fn
- integrate torchtitan configs for DP/TP to control autop

Hack an init_fn for llama3 and observe loss decreasing with autoparallel

"""
[rank0]:[titan] 2025-06-16 16:24:16,593 - root - INFO - Training starts at step 1.
[rank0]:[titan] 2025-06-16 16:24:23,544 - root - INFO - step: 1 loss: 8.1880 memory: 4.88GiB(6.16%) tps: 28
[rank0]:[titan] 2025-06-16 16:24:23,545 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:[titan] 2025-06-16 16:24:23,842 - root - INFO - step: 2 loss: 8.1610 memory: 4.90GiB(6.20%) tps: 13,785
[rank0]:[titan] 2025-06-16 16:24:24,135 - root - INFO - step: 3 loss: 8.0871 memory: 4.90GiB(6.20%) tps: 14,006
[rank0]:[titan] 2025-06-16 16:24:24,433 - root - INFO - step: 4 loss: 7.9516 memory: 4.90GiB(6.20%) tps: 13,770
[rank0]:[titan] 2025-06-16 16:24:24,727 - root - INFO - step: 5 loss: 7.8552 memory: 4.90GiB(6.20%) tps: 13,959
[rank0]:[titan] 2025-06-16 16:24:25,023 - root - INFO - step: 6 loss: 7.7732 memory: 4.90GiB(6.20%) tps: 13,859
[rank0]:[titan] 2025-06-16 16:24:25,324 - root - INFO - step: 7 loss: 7.6987 memory: 4.90GiB(6.20%) tps: 13,664
[rank0]:[titan] 2025-06-16 16:24:25,617 - root - INFO - step: 8 loss: 7.6779 memory: 4.90GiB(6.20%) tps: 13,985
[rank0]:[titan] 2025-06-16 16:24:25,911 - root - INFO - step: 9 loss: 7.6043 memory: 4.90GiB(6.20%) tps: 13,962
[rank0]:[titan] 2025-06-16 16:24:26,207 - root - INFO - step: 10 loss: 7.5778 memory: 4.90GiB(6.20%) tps: 13,891
"""

Adopt new autoparallel API with meta-init model

Allows reverting a lot of the hacks in the original integration, which were
caused by not creating a model object in train.py (a model_fn builder was
passed to autop instead).

Fixes to align with latest autoparallel

Add inductor config knobs for comms optimizations to torchtitan

Make inductor always run compile passes

Basically, this is an annoying workaround for debugging iteratively:
1. you run the model; it compiles, but something weird happens
2. you enable some logging or tlparse and rerun, but inductor decides not to
   run your pass anymore because its results are cached

Since (2) has confused me horribly on more than one occasion, I just disable
caching for now.
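For reference, a minimal standalone sketch of the workaround (plain PyTorch,
not torchtitan code; the same knob is wired up in the train.py hunk below):

```python
import torch
import torch.nn as nn

# Force inductor to bypass its caches so compile-time passes (such as the
# comms reordering passes added in this patch) re-run on every compile,
# instead of being skipped in favor of cached results.
torch._inductor.config.force_disable_caches = True

model = nn.Linear(8, 8)
compiled = torch.compile(model, fullgraph=True)
out = compiled(torch.randn(2, 8))  # passes re-run on each fresh compile
```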
Drop hacky llama3_init_fn and use autop init_weights feature

Relying on https://github.com/pytorch-labs/autoparallel/pull/20, this lets us
automatically apply a user's init_weights fn to the autoparallel model.

Verified this works with
`CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4 --training.dataset c4`

```
[rank0]:[titan] 2025-07-02 16:18:02,007 - root - INFO - Training starts at step 1.
[rank0]:[titan] 2025-07-02 16:18:08,224 - root - INFO - step: 1 loss: 8.1848 memory: 1.09GiB(1.14%) tps: 77 tflops: 0.01 mfu: 0.00%
[rank0]:[titan] 2025-07-02 16:18:08,224 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:[titan] 2025-07-02 16:18:08,310 - root - INFO - step: 2 loss: 8.1619 memory: 1.15GiB(1.21%) tps: 48,138 tflops: 3.46 mfu: 0.35%
[rank0]:[titan] 2025-07-02 16:18:08,356 - root - INFO - step: 3 loss: 8.1140 memory: 1.15GiB(1.21%) tps: 88,440 tflops: 6.36 mfu: 0.64%
[rank0]:[titan] 2025-07-02 16:18:08,406 - root - INFO - step: 4 loss: 8.0099 memory: 1.15GiB(1.21%) tps: 82,626 tflops: 5.94 mfu: 0.60%
[rank0]:[titan] 2025-07-02 16:18:08,457 - root - INFO - step: 5 loss: 7.8928 memory: 1.15GiB(1.21%) tps: 81,594 tflops: 5.87 mfu: 0.59%
[rank0]:[titan] 2025-07-02 16:18:08,508 - root - INFO - step: 6 loss: 7.7758 memory: 1.15GiB(1.21%) tps: 79,607 tflops: 5.72 mfu: 0.58%
[rank0]:[titan] 2025-07-02 16:18:08,559 - root - INFO - step: 7 loss: 7.6221 memory: 1.15GiB(1.21%) tps: 81,448 tflops: 5.86 mfu: 0.59%
[rank0]:[titan] 2025-07-02 16:18:08,611 - root - INFO - step: 8 loss: 7.5578 memory: 1.15GiB(1.21%) tps: 79,732 tflops: 5.73 mfu: 0.58%
[rank0]:[titan] 2025-07-02 16:18:08,659 - root - INFO - step: 9 loss: 7.3851 memory: 1.15GiB(1.21%) tps: 85,655 tflops: 6.16 mfu: 0.62%
[rank0]:[titan] 2025-07-02 16:18:08,709 - root - INFO - step: 10 loss: 7.3361 memory: 1.15GiB(1.21%) tps: 81,855 tflops: 5.89 mfu: 0.60%
[rank0]:[titan] 2025-07-02 16:18:08,709 - root - INFO - Sleeping 2 seconds for other ranks to complete
```

fix lint
---
 torchtitan/config_manager.py                  | 22 ++++++
 torchtitan/experiments/__init__.py            |  1 +
 .../experiments/auto_parallel/README.md       |  7 ++
 .../experiments/auto_parallel/__init__.py     | 31 ++++++++
 .../auto_parallel/parallelize_llama.py        | 77 +++++++++++++++++++
 torchtitan/train.py                           | 27 +++++--
 6 files changed, 159 insertions(+), 6 deletions(-)
 create mode 100644 torchtitan/experiments/auto_parallel/README.md
 create mode 100644 torchtitan/experiments/auto_parallel/__init__.py
 create mode 100644 torchtitan/experiments/auto_parallel/parallelize_llama.py

diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
index 5f1a1e8b7f..1a45a7800c 100644
--- a/torchtitan/config_manager.py
+++ b/torchtitan/config_manager.py
@@ -664,6 +664,28 @@ class Experimental:
     needs to ensure that the path can be imported.
     """
 
+    reorder_for_compute_comm_overlap: bool = False
+    """
+    Whether to enable the inductor comm reordering passes.
+    """
+
+    reorder_for_compute_comm_overlap_passes: list[str] = field(
+        default_factory=lambda: [
+            "sink_waits",
+            "reorder_communication_preserving_peak_memory",
+        ]
+    )
+    """
+    Sequence of reordering passes (names of functions inside _inductor.comms) to call,
+    if reorder_for_compute_comm_overlap is enabled.
+    """
+
+    reorder_prefetch_limit: int | None = None
+    """
+    How many ops any individual collective may be moved past, if the
+    'reorder_communication_preserving_peak_memory' pass is enabled. The default of None means unlimited.
+    """
+
 
 @dataclass
 class Validation:
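These fields are forwarded verbatim to inductor in the train.py hunk below.
For context, a minimal sketch of the equivalent direct inductor configuration
(the limit value of 2 is only an example; None means unlimited):

```python
import torch

# What the three knobs above amount to when applied directly to inductor.
torch._inductor.config.reorder_for_compute_comm_overlap = True
torch._inductor.config.reorder_for_compute_comm_overlap_passes = [
    "sink_waits",  # sink wait ops past compute they can overlap with
    "reorder_communication_preserving_peak_memory",
]
# Cap how many ops any one collective may be moved past (example value).
torch._inductor.config.reorder_prefetch_limit = 2
```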
diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py
index 4c54bdc13e..b7ff983e97 100644
--- a/torchtitan/experiments/__init__.py
+++ b/torchtitan/experiments/__init__.py
@@ -4,5 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torchtitan.experiments.auto_parallel  # noqa: F401
 import torchtitan.experiments.llama4  # noqa: F401
 import torchtitan.experiments.simple_fsdp  # noqa: F401
diff --git a/torchtitan/experiments/auto_parallel/README.md b/torchtitan/experiments/auto_parallel/README.md
new file mode 100644
index 0000000000..ef66a59166
--- /dev/null
+++ b/torchtitan/experiments/auto_parallel/README.md
@@ -0,0 +1,7 @@
+## Auto Parallel
+
+requires installing git@github.com:pytorch-labs/autoparallel.git
+
+`CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4`
+
+(or llama3-8b.toml)
diff --git a/torchtitan/experiments/auto_parallel/__init__.py b/torchtitan/experiments/auto_parallel/__init__.py
new file mode 100644
index 0000000000..8f5a876b4e
--- /dev/null
+++ b/torchtitan/experiments/auto_parallel/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) Meta Platforms, Inc. All Rights Reserved.
+
+from torchtitan.components.loss import build_cross_entropy_loss
+from torchtitan.components.lr_scheduler import build_lr_schedulers
+from torchtitan.components.optimizer import build_optimizers
+from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.datasets.hf_datasets import build_hf_dataloader
+from torchtitan.models.llama3 import llama3_configs, pipeline_llama, Transformer
+from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
+from .parallelize_llama import parallelize_llama
+
+register_train_spec(
+    TrainSpec(
+        name="llama3_auto_parallel",
+        cls=Transformer,
+        config=llama3_configs,
+        parallelize_fn=parallelize_llama,
+        pipelining_fn=pipeline_llama,
+        build_optimizers_fn=build_optimizers,
+        build_lr_schedulers_fn=build_lr_schedulers,
+        build_dataloader_fn=build_hf_dataloader,
+        build_tokenizer_fn=build_hf_tokenizer,
+        build_loss_fn=build_cross_entropy_loss,
+    )
+)
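With this registration in place, the experiment is selected purely by
`--model.name llama3_auto_parallel`. A hedged sketch of the consuming side,
mirroring how train.py resolves a spec (`get_train_spec` and the
`debugmodel` flavor name are assumptions, not part of this patch):

```python
import torch

# Hypothetical lookup sketch; get_train_spec and the "debugmodel" flavor
# name are assumed for illustration.
from torchtitan.protocols.train_spec import get_train_spec

spec = get_train_spec("llama3_auto_parallel")
model_args = spec.config["debugmodel"]
with torch.device("meta"):
    model = spec.cls(model_args)  # Transformer on the meta device, as in train.py
```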
diff --git a/torchtitan/experiments/auto_parallel/parallelize_llama.py b/torchtitan/experiments/auto_parallel/parallelize_llama.py
new file mode 100644
index 0000000000..bb7f1204df
--- /dev/null
+++ b/torchtitan/experiments/auto_parallel/parallelize_llama.py
@@ -0,0 +1,77 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import time
+
+import torch
+
+from autoparallel.api import AutoParallel
+
+from torch.distributed import DeviceMesh
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from torchtitan.config_manager import JobConfig
+from torchtitan.distributed import ParallelDims
+
+from torchtitan.tools.logging import logger
+
+
+def parallelize_llama(
+    model,
+    world_mesh: DeviceMesh,
+    parallel_dims: ParallelDims,
+    job_config: JobConfig,
+):
+    """
+    Apply tensor parallelism, activation checkpointing, torch.compile, and data
+    parallelism to the model.
+
+    NOTE: The passed-in model preferably should be on meta device. Otherwise,
+    the model must fit on GPU or CPU memory.
+    """
+
+    def input_fn():
+        global_batch_size = job_config.training.global_batch_size
+        if global_batch_size < 0:
+            # This global batch size results in 1 gradient accumulation step.
+            dp_degree = world_mesh["dp"].size()
+            global_batch_size = job_config.training.local_batch_size * dp_degree
+        return torch.rand(
+            (global_batch_size, job_config.training.seq_len), device="cuda"
+        )
+
+    # TODO: make autop work correctly with different combinations of DP, DP+TP, TP, and support DDP / HSDP
+    assert (
+        len(world_mesh.shape) == 2
+    ), "Only support 2D mesh (DP, TP) for now; OK if one dim has size 1"
+    assert parallel_dims.dp_shard_enabled is True, "DDP not supported yet"
+    assert parallel_dims.dp_replicate_enabled is False, "DDP not supported yet"
+    assert parallel_dims.cp_enabled is False, "CP not supported yet"
+    assert parallel_dims.pp_enabled is False, "PP not supported yet"
+
+    # bail out
+    # model = model_fn()
+    # return model
+
+    autop = AutoParallel(model, input_fn, world_mesh)
+    autop.add_parameter_memory_constraint(low=None, high=None)
+
+    x_sharding = (Shard(0), Replicate())
+
+    autop.add_input_constraints([x_sharding])
+    autop.add_output_constraints([x_sharding])
+
+    t0 = time.time()
+    sharding_placement = autop.optimize_placement()
+    t1 = time.time()
+    logger.info(f"AutoParallel took {t1 - t0} seconds")
+
+    parallel_mod = autop.apply_placement(sharding_placement)
+
+    if job_config.training.compile:
+        torch._inductor.config.reorder_for_peak_memory = False
+        parallel_mod = torch.compile(parallel_mod, fullgraph=True)
+
+    return parallel_mod
diff --git a/torchtitan/train.py b/torchtitan/train.py
index 3dc8a61b28..9626d8a5a4 100644
--- a/torchtitan/train.py
+++ b/torchtitan/train.py
@@ -12,6 +12,7 @@
 import torch
 
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.distributed.tensor import DTensor
 
 import torchtitan.components.ft as ft
 import torchtitan.protocols.train_spec as train_spec_module
@@ -116,6 +117,21 @@ def __init__(self, job_config: JobConfig):
             gc_freq=job_config.training.gc_freq, debug=job_config.training.gc_debug
         )
 
+        # TODO(whc)
+        # I do this because otherwise sometimes inductor will skip re-running passes like comms reordering
+        torch._inductor.config.force_disable_caches = True
+
+        # allow configuring inductor comms optimizations from the torchtitan command line
+        torch._inductor.config.reorder_for_compute_comm_overlap = (
+            job_config.experimental.reorder_for_compute_comm_overlap
+        )
+        torch._inductor.config.reorder_for_compute_comm_overlap_passes = (
+            job_config.experimental.reorder_for_compute_comm_overlap_passes
+        )
+        torch._inductor.config.reorder_prefetch_limit = (
+            job_config.experimental.reorder_prefetch_limit
+        )
+
         # Set random seed, and maybe enable deterministic mode
         # (mainly for debugging, expect perf loss).
         dist_utils.set_determinism(
@@ -141,20 +157,19 @@ def __init__(self, job_config: JobConfig):
         )
 
         # build model (using meta init)
-        model_cls = self.train_spec.cls
         model_args = self.train_spec.config[job_config.model.flavor]
+        model_cls = self.train_spec.cls
         # set the model args from training job configs
         model_args.update_from_config(job_config, tokenizer)
-
         logger.info(
             f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}"
         )
+
         with torch.device("meta"):
             model = model_cls(model_args)
-
-        # Build the collection of model converters. No-op if `model.converters` empty
-        model_converters = build_model_converters(job_config, parallel_dims)
-        model_converters.convert(model)
+            # Build the collection of model converters. No-op if `model.converters` empty
+            model_converters = build_model_converters(job_config, parallel_dims)
+            model_converters.convert(model)
 
         # metrics logging
         build_metrics_processor_fn = (