@@ -14,13 +14,17 @@
 from torch._guards import tracing

 from torch.distributed.tensor import DTensor, Replicate
+
+from torch.fx.traceback import annotate_fn
 from torchtitan.config import JobConfig
 from torchtitan.distributed import ParallelDims
+from torchtitan.distributed.expert_parallel import ExpertParallel

 from torchtitan.experiments.compiler_toolkit.graph_utils import export_joint
 from torchtitan.experiments.simple_fsdp.deepseek_v3.parallelize import (
     parallelize_deepseekv3 as simple_fsdp_parallelize_deepseekv3,
 )
+from torchtitan.models.moe.moe import MoE
 from torchtitan.tools.logging import logger

@@ -128,12 +132,25 @@ def wrapper_fn(args): |
     return wrapper_fn


+def annotate_model() -> None:
+    # annotate MoE token dispatch, expert compute, and token combine
+    ExpertParallel._token_dispatch = annotate_fn({"EP": "dispatch"})(
+        ExpertParallel._token_dispatch
+    )
+    ExpertParallel._token_combine = annotate_fn({"EP": "combine"})(
+        ExpertParallel._token_combine
+    )
+    MoE.forward = annotate_fn({"EP": "compute"})(MoE.forward)
+
+
 def parallelize_deepseekv3(
     model: nn.Module,
     parallel_dims: ParallelDims,
     job_config: JobConfig,
 ) -> CompiledModule:

+    annotate_model()
+
     # Disable torch.compile over the model in the compiler toolkit style workflow
     with disable_compile(job_config):
         model = simple_fsdp_parallelize_deepseekv3(model, parallel_dims, job_config)
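
The annotate_fn pattern above is a plain decorator factory: it takes a metadata dict and returns a decorator, which is why annotate_model can re-bind existing methods to annotated wrappers. Below is a minimal, self-contained sketch of how one might verify such annotations on a traced graph. It is not part of the commit: token_dispatch is a hypothetical stand-in for ExpertParallel._token_dispatch, and the assumptions that annotations are recorded only while node meta is preserved and surface under node.meta["custom"] should be checked against your torch version.

import torch
import torch.fx.traceback as fx_traceback
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.traceback import annotate_fn


# Hypothetical stand-in for ExpertParallel._token_dispatch; only the
# annotation pattern matches the commit above.
@annotate_fn({"EP": "dispatch"})
def token_dispatch(x: torch.Tensor) -> torch.Tensor:
    return x + 1


def fn(x: torch.Tensor) -> torch.Tensor:
    return token_dispatch(x) * 2


# Assumption: annotations are only recorded while node meta is preserved,
# and they surface under node.meta["custom"]; verify on your torch build.
with fx_traceback.preserve_node_meta():
    gm = make_fx(fn)(torch.randn(4))

for node in gm.graph.nodes:
    print(node.name, node.meta.get("custom"))

If the assumption holds, the nodes traced from token_dispatch print {"EP": "dispatch"}, which is how the {"EP": ...} tags applied in annotate_model would mark the dispatch, compute, and combine regions in the exported joint graph.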