Implicit overlap of shared expert compute and token combine communication (#1741)

mreso · web-flow · commit 476a965f9343 · 2025-09-23T09:08:39.000-07:00
This PR moves the computation of the shared expert ahead of the (conditional) scoring of the routed expert output, which leads to an implicit overlap between the shared expert compute and the token combine communication in MoE models. Repro (with the number of layers lowered to 2): ``` CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh --profiling.enable_profiling --profiling.profile_freq 10 --training.steps 10 ``` Trace before the change: <img width="1503" height="625" alt="Screenshot 2025-09-23 at 12 08 31 AM" src="https://github.com/user-attachments/assets/bbcc41cf-6497-482e-972e-d917baf4498e" /> Trace after the change (note that the all-to-all communication now overlaps with the shared expert compute): <img width="1503" height="625" alt="Screenshot 2025-09-23 at 12 04 56 AM" src="https://github.com/user-attachments/assets/3504e77c-aa14-46fd-8e47-e247b88d7b9c" /> cc @tianyu-l @xmfan
diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py
@@ -417,18 +417,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # shape (bs*slen*top_k, dim)
         routed_output = self.experts(routed_input, num_tokens_per_expert)
 
-        if not self.score_before_experts:
-            routed_output = (
-                routed_output.to(torch.float32)
-                * top_scores_experts_sorted.reshape(-1, 1)
-            ).to(x.dtype)
-
         # shared expert
+        # Note: we execute the shared expert before scoring the output of the routed expert
+        # to "implicitly" overlap the shared expert compute with token combine communication
         if self.shared_experts is not None:
             out = self.shared_experts(x)
         else:
             out = torch.zeros_like(x)
 
+        if not self.score_before_experts:
+            routed_output = (
+                routed_output.to(torch.float32)
+                * top_scores_experts_sorted.reshape(-1, 1)
+            ).to(x.dtype)
+
         out = out.scatter_add(
             dim=0, index=token_indices_experts_sorted, src=routed_output
         )