Commit 751a472

use sync based mxfp8 a2a
1 parent: 5c38830

2 files changed: +61 −160 lines changed


torchtitan/distributed/expert_parallel.py

Lines changed: 58 additions & 144 deletions
@@ -22,6 +22,7 @@
 )
 from torch.distributed.tensor.parallel import ParallelStyle

+from torchtitan.tools.logging import logger
 from torchtitan.tools.utils import _round_up


@@ -87,20 +88,60 @@ class ExpertParallel(ParallelStyle):

     Args:
         a2a_impl (str): The implementation of all-to-all. Default is "default". Options are ["default","mxfp8"].
-        max_tokens_per_ep_rank (int): The maximum number of tokens per expert rank. Only used for "mxfp8".
     """

-    def __init__(self, a2a_impl: str = "default", max_tokens_per_ep_rank: int = -1):
+    def __init__(self, a2a_impl: str = "default"):
         super().__init__()
         self.input_splits = None
         self.output_splits = None
-        self.a2a_impl = a2a_impl
-        self.max_tokens_per_ep_rank = max_tokens_per_ep_rank
+        self.a2a_func = self._get_a2a_func(a2a_impl)
+
+    def _get_a2a_func(self, a2a_impl: str):
+        if a2a_impl == "default":
+            logger.info("Using default all-to-all implementation")
+            return all_to_all_single_autograd
+        elif a2a_impl == "mxfp8":
+            logger.info("Using mxfp8 all-to-all implementation")
+            from torchao.prototype.moe_training.kernels.mxfp8.comms import (
+                mxfp8_sync_all_to_all_v,
+            )
+
+            return mxfp8_sync_all_to_all_v
+        else:
+            raise ValueError(f"Unknown a2a_impl: {a2a_impl}")

     # performing all-to-all dispatch on the input
     def _token_dispatch(self, mod, inputs, device_mesh):
         # annotate module input placements/sharding with input_layouts
         routed_input, num_tokens_per_expert = inputs
+        ep_size = device_mesh.size(0)
+
+        # generate the input splits and output splits for all-to-all
+        with torch.no_grad():
+            num_tokens_per_expert_group = all_to_all_single(
+                num_tokens_per_expert,
+                None,
+                None,
+                group=device_mesh.get_group(),
+            )
+            # Need to wait explicitly because it is used by a triton kernel later
+            # which doesn't realize that AsyncCollectiveTensor needs unwrapping
+            num_tokens_per_expert_group = torch.ops._c10d_functional.wait_tensor(
+                num_tokens_per_expert_group
+            )
+            input_splits = (
+                num_tokens_per_expert.view(ep_size, -1)
+                .sum(dim=1)
+                .to(torch.device("cpu"), non_blocking=True)
+            )
+            # NOTE: this would incur a device-to-host sync
+            output_splits = (
+                num_tokens_per_expert_group.view(ep_size, -1)
+                .sum(dim=1)
+                .to(torch.device("cpu"), non_blocking=False)
+            )
+        self.input_splits = input_splits.tolist()
+        self.output_splits = output_splits.tolist()

         # NOTE: After this all-to-all, the routed input is put on proper EP rank.
         # However, the num_tokens_per_expert_group is not of the final target format
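
The sync-based dispatch added above rebuilds the split lists on every call: input_splits comes from the local per-expert token counts, output_splits from the counts exchanged via all_to_all_single, and the .tolist() / non_blocking=False copy is what forces the device-to-host sync the NOTE mentions. Below is a minimal single-process sketch of just that split arithmetic, with made-up token counts standing in for the exchanged values (no collectives involved):

import torch

# Hypothetical setup: 2 EP ranks, 2 local experts per rank (4 experts total).
ep_size = 2
# Tokens this rank routes to each of the 4 global experts.
num_tokens_per_expert = torch.tensor([3, 1, 2, 2])

# input_splits[r] = tokens this rank sends to EP rank r (sum over r's local experts).
input_splits = num_tokens_per_expert.view(ep_size, -1).sum(dim=1)
print(input_splits.tolist())  # [4, 4]

# In the real code num_tokens_per_expert_group comes back from all_to_all_single;
# here it is faked: what the peer ranks route to our two local experts.
num_tokens_per_expert_group = torch.tensor([3, 1, 5, 0])

# output_splits[r] = tokens this rank receives from EP rank r. The .tolist() call
# (like .to("cpu", non_blocking=False) in the diff) is the device-to-host sync point.
output_splits = num_tokens_per_expert_group.view(ep_size, -1).sum(dim=1)
print(output_splits.tolist())  # [4, 5]
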
@@ -111,25 +152,12 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         # We need to perform another shuffle to get the correct format -- this is done via the function
         # generate_permute_indices in moe.py, which also does padding to make sure the number of tokens
         # each expert gets locally is a multiple of ALIGN_SIZE_M.
-        if self.a2a_impl == "mxfp8":
-            (
-                routed_input,
-                self.input_splits,
-                self.output_splits,
-                num_tokens_per_expert_group,
-            ) = mxfp8_a2a_dispatch(
-                routed_input,
-                num_tokens_per_expert,
-                device_mesh,
-                self.max_tokens_per_ep_rank,
-            )
-        else:
-            (
-                routed_input,
-                self.input_splits,
-                self.output_splits,
-                num_tokens_per_expert_group,
-            ) = default_a2a_dispatch(routed_input, num_tokens_per_expert, device_mesh)
+        routed_input = self.a2a_func(
+            routed_input,
+            self.output_splits,
+            self.input_splits,
+            device_mesh.get_group(),
+        )
         return routed_input, num_tokens_per_expert_group

     @staticmethod
@@ -141,25 +169,13 @@ def _partition_fn(name, mod, device_mesh):

     # performing all-to-all combine on the output
     def _token_combine(self, mod, routed_output, device_mesh):
-        if self.a2a_impl == "mxfp8":
-            from torchao.prototype.moe_training.kernels.mxfp8.comms import (
-                mxfp8_on_device_all_to_all_v,
-            )
-
-            # For a2a combine, output splits are the input splits, and input splits are the output splits.
-            routed_output, self.input_splits = mxfp8_on_device_all_to_all_v(
-                routed_output,
-                self.output_splits,
-                self.max_tokens_per_ep_rank,
-                device_mesh.get_group().group_name,
-            )
-        else:
-            routed_output = all_to_all_single_autograd(
-                routed_output,
-                self.input_splits,
-                self.output_splits,
-                device_mesh.get_group(),
-            )
+        # For a2a combine, input splits and output splits are opposite of a2a dispatch.
+        routed_output = self.a2a_func(
+            routed_output,
+            self.input_splits,
+            self.output_splits,
+            device_mesh.get_group(),
+        )
         return routed_output

     def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
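
Collapsing both branches into self.a2a_func works because dispatch and combine are mirror images: the combine call above reuses the dispatch split lists with the input/output roles swapped. A small two-rank round-trip sketch of that convention, using plain torch.distributed.all_to_all_single for illustration (an assumption; not the autograd or mxfp8 variants used in the diff). Launch with e.g. torchrun --nproc_per_node=2, on a backend whose build supports all_to_all_single:

import torch
import torch.distributed as dist


def main():
    dist.init_process_group("gloo")  # assumes a gloo build with all_to_all support
    rank, world = dist.get_rank(), dist.get_world_size()

    # Pretend rank r routes (r + 1) tokens to every peer.
    input_splits = [rank + 1] * world                     # send sizes per destination rank
    output_splits = [peer + 1 for peer in range(world)]   # receive sizes per source rank

    send = torch.full((sum(input_splits),), float(rank))
    recv = torch.empty(sum(output_splits))

    # Dispatch: receive-side splits first, send-side splits second.
    dist.all_to_all_single(recv, send, output_splits, input_splits)

    # Combine: same call with the split lists swapped, routing every token back home.
    back = torch.empty(sum(input_splits))
    dist.all_to_all_single(back, recv, input_splits, output_splits)

    assert torch.equal(back, send), "round trip should restore the original tokens"
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
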
@@ -349,105 +365,3 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
             input_fn=self._prepare_inputput_fn,
             output_fn=self._prepare_output_fn,
         )
-
-
-def default_a2a_dispatch(
-    routed_input: torch.Tensor,
-    num_tokens_per_expert: torch.Tensor,
-    device_mesh: DeviceMesh,
-):
-    """
-    Default implementation of all-to-all dispatch. Incurs device-to-host sync.
-
-    Returns:
-        routed_input: the local tokens after all-to-all dispatch
-        input_splits: the input splits for all-to-all dispatch
-        output_splits: the output splits for all-to-all dispatch
-        num_tokens_per_expert_group: the number of tokens per EP rank after all-to-all dispatch
-    """
-    ep_degree = device_mesh.size(0)
-    # generate the input splits and output splits for all-to-all
-    with torch.no_grad():
-        num_tokens_per_expert_group = all_to_all_single(
-            num_tokens_per_expert,
-            None,
-            None,
-            group=device_mesh.get_group(),
-        )
-        # Need to wait explicitly because it is used by a triton kernel later
-        # which doesn't realize that AsyncCollectiveTensor needs unwrapping
-        num_tokens_per_expert_group = torch.ops._c10d_functional.wait_tensor(
-            num_tokens_per_expert_group
-        )
-        input_splits = (
-            num_tokens_per_expert.view(ep_degree, -1)
-            .sum(dim=1)
-            .to(torch.device("cpu"), non_blocking=True)
-        )
-        # NOTE: this would incur a device-to-host sync
-        output_splits = (
-            num_tokens_per_expert_group.view(ep_degree, -1)
-            .sum(dim=1)
-            .to(torch.device("cpu"), non_blocking=False)
-        )
-    input_splits_list = input_splits.tolist()
-    output_splits_list = output_splits.tolist()
-
-    # perform all-to-all
-    routed_input = all_to_all_single_autograd(
-        routed_input,
-        output_splits_list,
-        input_splits_list,
-        device_mesh.get_group(),
-    )
-    return (
-        routed_input,
-        input_splits_list,
-        output_splits_list,
-        num_tokens_per_expert_group,
-    )
-
-
-def mxfp8_a2a_dispatch(
-    routed_input: torch.Tensor,
-    num_tokens_per_expert: torch.Tensor,
-    device_mesh: DeviceMesh,
-    max_tokens_per_ep_rank: int,
-):
-    """
-    Perform on-device all-to-all dispatch with dynamically quantized mxfp8 inputs to save network bandwidth
-    and avoid device-to-host sync.
-
-    Returns:
-        routed_input: the local tokens after all-to-all dispatch
-        input_splits: the input splits for all-to-all dispatch
-        output_splits: the output splits for all-to-all dispatch
-        num_tokens_per_expert_group: the number of tokens per EP rank after all-to-all dispatch
-    """
-    from torchao.prototype.moe_training.kernels.mxfp8.comms import (
-        mxfp8_on_device_all_to_all_v,
-    )
-
-    ep_degree = device_mesh.size(0)
-    input_splits_per_ep_rank = num_tokens_per_expert.view(ep_degree, -1).sum(dim=1)
-    num_tokens_per_expert_group = all_to_all_single(
-        num_tokens_per_expert,
-        None,
-        None,
-        group=device_mesh.get_group(),
-    )
-    num_tokens_per_expert_group = torch.ops._c10d_functional.wait_tensor(
-        num_tokens_per_expert_group
-    )
-    routed_input, output_splits_per_ep_rank = mxfp8_on_device_all_to_all_v(
-        routed_input,
-        input_splits_per_ep_rank,
-        max_tokens_per_ep_rank,
-        device_mesh.get_group().group_name,
-    )
-    return (
-        routed_input,
-        input_splits_per_ep_rank,
-        output_splits_per_ep_rank,
-        num_tokens_per_expert_group,
-    )
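
With the two module-level helpers above removed, selecting the communication path is now just a name passed to the constructor, and the call sites in parallelize.py (next file) shrink accordingly. A hypothetical direct instantiation, assuming only what this diff shows:

from torchtitan.distributed.expert_parallel import ExpertParallel

# "default" resolves to all_to_all_single_autograd; "mxfp8" resolves to torchao's
# mxfp8_sync_all_to_all_v (requires torchao with the MoE-training prototype kernels).
ep_default = ExpertParallel()
ep_mxfp8 = ExpertParallel(a2a_impl="mxfp8")
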

torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 3 additions & 16 deletions
@@ -94,13 +94,6 @@ def parallelize_llama(
         )
         maybe_enable_async_tp(job_config, world_mesh["tp"])

-    # Worst case = single expert receives all tokens
-    # TODO: explore using token dropping to avoid this huge overallocation
-    max_tokens_per_ep_rank = (
-        job_config.training.seq_len
-        * job_config.training.local_batch_size
-        * model.model_args.moe_args.num_experts
-    )
     if parallel_dims.tp_enabled or parallel_dims.ep_enabled:
         apply_moe_ep_tp(
             model,
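
The deleted block sized the old on-device mxfp8 all-to-all for the worst case in which a single expert receives every routed token, which is why the TODO calls the allocation huge. For a sense of scale only (these numbers are hypothetical, not from the commit):

# Hypothetical config values, purely to illustrate the old worst-case buffer size.
seq_len = 8192
local_batch_size = 8
num_experts = 16

max_tokens_per_ep_rank = seq_len * local_batch_size * num_experts
print(max_tokens_per_ep_rank)  # 1048576 tokens reserved per EP rank, regardless of actual routing

The sync-based path no longer needs this bound: output buffer sizes are known exactly once the split exchange completes.
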
@@ -115,7 +108,6 @@
             ),
             etp_enabled=parallel_dims.etp_enabled,
             a2a_impl=job_config.parallelism.expert_parallel_a2a_impl,
-            max_tokens_per_ep_rank=max_tokens_per_ep_rank,
         )

     model_compile_enabled = (
@@ -447,8 +439,7 @@ def apply_moe_ep_tp(
     ep_mesh: DeviceMesh | None,
     ep_tp_mesh: DeviceMesh | None,
     etp_enabled: bool,
-    a2a_impl: str,
-    max_tokens_per_ep_rank: int = -1,  # Only used for mxfp8 a2a
+    a2a_impl: str = "default",
 ):
     for transformer_block in model.layers.values():
         if not transformer_block.moe_enabled:
@@ -498,17 +489,13 @@ def apply_moe_ep_tp(
         elif tp_mesh is None:
             experts_mesh = ep_mesh
             # input / output sharding on the batch / tokens dim
-            experts_plan = ExpertParallel(
-                a2a_impl=a2a_impl, max_tokens_per_ep_rank=max_tokens_per_ep_rank
-            )
+            experts_plan = ExpertParallel(a2a_impl=a2a_impl)
         elif etp_enabled:
             experts_mesh = ep_tp_mesh
             experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh)
         else:
             experts_mesh = ep_mesh
-            experts_plan = ExpertParallel(
-                a2a_impl=a2a_impl, max_tokens_per_ep_rank=max_tokens_per_ep_rank
-            )
+            experts_plan = ExpertParallel(a2a_impl=a2a_impl)

         parallelize_module(
             module=transformer_block.moe.experts,
