Commit f47d26d

Annotate EP with dispatch/compute/combine
ghstack-source-id: faa8a54
Pull Request resolved: #1907
1 parent c5aa247

2 files changed (+3, -0 lines)

torchtitan/distributed/expert_parallel.py

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,7 @@ def __init__(self):
         self.permuted_indices = None
 
     # performing all-to-all dispatch on the input
+    @torch.fx.traceback.annotate_fn({"EP": "dispatch"})
     def _token_dispatch(self, mod, inputs, device_mesh):
         # annotate module input placements/sharding with input_layouts
         routed_input, num_tokens_per_expert = inputs
@@ -145,6 +146,7 @@ def _partition_fn(name, mod, device_mesh):
         mod.register_parameter(name, dist_param)
 
     # performing all-to-all combine on the output
+    @torch.fx.traceback.annotate_fn({"EP": "combine"})
     def _token_combine(self, mod, routed_output, device_mesh):
         routed_output = _unpermute(
             routed_output, self.input_shape, self.permuted_indices

torchtitan/models/moe/moe.py

Lines changed: 1 addition & 0 deletions
@@ -139,6 +139,7 @@ def __init__(
         self.w3 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
         self.use_grouped_mm = use_grouped_mm
 
+    @torch.fx.traceback.annotate_fn({"EP": "compute"})
     def forward(
         self,
         x: torch.Tensor,
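
For context, a minimal sketch of the annotation pattern this commit applies, assuming a PyTorch build where torch.fx.traceback.annotate_fn is available as a decorator (the standalone functions below are illustrative stand-ins, not torchtitan code):

import torch.fx.traceback as fx_traceback

# Illustrative stand-ins for _token_dispatch, GroupedExperts.forward, and
# _token_combine. Each region is tagged with an {"EP": ...} label so that,
# once the model is traced or compiled, graph passes and debugging tools can
# tell the dispatch, compute, and combine phases of Expert Parallel apart.

@fx_traceback.annotate_fn({"EP": "dispatch"})
def dispatch_tokens(x):
    return x  # placeholder for the all-to-all dispatch

@fx_traceback.annotate_fn({"EP": "compute"})
def expert_compute(x):
    return x * 2  # placeholder for the grouped expert computation

@fx_traceback.annotate_fn({"EP": "combine"})
def combine_tokens(x):
    return x  # placeholder for the all-to-all combine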
