
Commit dbb34cc

[llama4] add Grouped GEMM support for MoE (#1084)
This PR
1. adds grouped GEMM support (pytorch/pytorch#150374) for the Llama 4 MoE. In general, it avoids device/host syncs; for TP, it also avoids the sharding-propagation overhead caused by the varying number of tokens per expert. The speedup on the debug model is ~4x, both with and without TP. I'm deliberately keeping the for-loop implementation for now, for comparison and readability purposes.
2. moves the MoE indices kernel from the deepseek folder to the kernel folder.

In order for TP to work, it requires some pytorch-side changes (e.g. DTensor support for `torch._grouped_mm`), for which I will submit PRs soon. A known issue is that the grouped GEMM version doesn't work well with the AdamW optimizer, which is to be investigated.

cc: @janeyx99
1 parent ab08612 commit dbb34cc
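To make the change concrete, here is a minimal standalone sketch (not code from this PR) contrasting the two expert-computation paths the diffs below keep side by side. It assumes a CUDA device and a PyTorch build that includes pytorch/pytorch#150374 (which adds `torch._grouped_mm`); sizes are made up, and the per-expert token counts are already multiples of 16 so no padding is needed.

import torch

num_experts, dim, hidden_dim = 4, 64, 128
tokens_per_expert = [16, 32, 16, 48]  # one entry per expert, already aligned
x = torch.randn(sum(tokens_per_expert), dim, device="cuda", dtype=torch.bfloat16)
w1 = torch.randn(num_experts, dim, hidden_dim, device="cuda", dtype=torch.bfloat16)

# for-loop path: one matmul per expert, driven by Python-side split sizes
out_loop = torch.cat(
    [x_e @ w1[i] for i, x_e in enumerate(torch.split(x, tokens_per_expert, dim=0))]
)

# grouped-GEMM path: one kernel call; group boundaries come from on-device
# int32 offsets, so the sizes never need to be read back to the host
offsets = torch.cumsum(
    torch.tensor(tokens_per_expert, device="cuda"), dim=0, dtype=torch.int32
)
out_grouped = torch._grouped_mm(x, w1, offs=offsets)

# the two paths agree up to bf16 rounding
torch.testing.assert_close(out_loop, out_grouped, rtol=1e-2, atol=1e-2)

In the real model the split sizes come from the router, so the for-loop path has to call `.tolist()` on a device tensor (a device-to-host sync), while the grouped path keeps the counts on the GPU.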

File tree

9 files changed, +127 -68 lines

torchtitan/experiments/deepseek_v3/model.py

Lines changed: 1 addition & 1 deletion
@@ -37,12 +37,12 @@
 import torch.utils.checkpoint

 from attn_mask_utils import _prepare_4d_causal_attention_mask
-from indices import generate_permute_indices
 from model_config import ModelArgs
 from symm_mem_recipes import OnDeviceAllToAllV
 from torch import nn
 from torch.distributed._functional_collectives import all_to_all_single_autograd

+from torchtitan.experiments.kernels.moe.indices import generate_permute_indices
 from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import (
     ALIGN_SIZE_M,
     grouped_gemm_forward,

torchtitan/experiments/llama4/README.md

Lines changed: 6 additions & 6 deletions
@@ -1,8 +1,8 @@
 **The Llama 4 folder is still under development.**

 #### Available features
-- Llama 4 model definition (text-only), including the MoE architecture with token-choice routing
-- Basic FSDP, TP, PP, CP support
+- Llama 4 model definition (text-only), including the MoE architecture with token-choice routing using efficient bfloat16 Grouped MM kernels
+- FSDP, TP, PP, CP support
 - DCP checkpoint conversion scripts

 #### Download Llama 4 tokenizer
@@ -17,13 +17,13 @@ python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E
 - load balance loss for token-choice MoE
 - alternative expert-choice MoE
 - multimodal support
-- Kernel integration
-    - efficient bfloat16 GroupedGEMM kernels (from PyTorch core)
-    - efficient float8 GroupedGEMM kernels (from torchao)
 - Parallelism
-    - performant TP implementation and torch.compile support for MoE layers
     - Context Parallel support for FlexAttention, iRoPE, and multimodal inputs
     - Expert Parallel support
+- torch.compile
+    - for MoE layers
+- Quantization
+    - efficient float8 GroupedGEMM kernels (from torchao)
 - Testing
     - perfomance and loss converging tests
     - CI integration

torchtitan/experiments/llama4/infra/expert_parallel.py

Lines changed: 20 additions & 22 deletions
@@ -8,21 +8,21 @@
 from functools import partial
 from typing import Optional, Tuple

+import torch
 import torch.nn as nn
 from torch.distributed.tensor import (
     DeviceMesh,
     distribute_module,
     distribute_tensor,
     DTensor,
-    Partial,
     Replicate,
     Shard,
 )
 from torch.distributed.tensor.parallel import ParallelStyle
 from torch.distributed.tensor.placement_types import Placement


-# implementation of Tensor Parallel on the non-shared experts in MoE
+# implementation of Tensor Parallel for the GroupedExperts in MoE
 class TensorParallel(ParallelStyle):
     def __init__(
         self,
@@ -32,33 +32,31 @@ def __init__(
         use_local_output: bool = True,
     ):
         super().__init__()
-        self.input_layouts = input_layouts or (Replicate(), None)
-        self.output_layout = output_layout or Partial()
-        self.desired_input_layouts = (Replicate(), None)
+        self.input_layouts = input_layouts or (Replicate(), Replicate())
+        self.output_layout = output_layout or Replicate()
+        self.desired_input_layouts = (Replicate(), Replicate())
         self.use_local_output = use_local_output

     @staticmethod
     def _prepare_input_fn(
         input_layouts, desired_input_layouts, mod, inputs, device_mesh
     ):
-        # TODO: figure out dynamo support for instance method and switch this to instance method
-
+        prepared_inputs = []
         # annotate module input placements/sharding with input_layouts
-        input_tensor, input_layout, desired_input_layout = (
-            inputs[0],
-            input_layouts[0],
-            desired_input_layouts[0],
-        )
-        if not isinstance(input_tensor, DTensor):
-            input_tensor = DTensor.from_local(
-                input_tensor, device_mesh, (input_layout,), run_check=False
-            )
-
-        if input_layouts != desired_input_layouts:
-            input_tensor = input_tensor.redistribute(
-                placements=(desired_input_layout,), async_op=True
-            )
-        return (input_tensor, *inputs[1:])
+        for inp, input_layout, desired_input_layout in zip(
+            inputs, input_layouts, desired_input_layouts
+        ):
+            if isinstance(inp, torch.Tensor):
+                if not isinstance(inp, DTensor):
+                    inp = DTensor.from_local(
+                        inp, device_mesh, (input_layout,), run_check=False
+                    )
+                if input_layout != desired_input_layout:
+                    inp = inp.redistribute(
+                        placements=(desired_input_layout,), async_op=True
+                    )
+            prepared_inputs.append(inp)
+        return tuple(prepared_inputs)

     def _partition_fn(self, name, module, device_mesh):
         module.register_parameter(

torchtitan/experiments/llama4/infra/parallelize_llama.py

Lines changed: 2 additions & 2 deletions
@@ -149,8 +149,8 @@ def apply_moe_tp(
         # replicate computation for the router
         "moe.router.gate": NoParallel(),
         # input Replicate, output Partial
-        "moe.experts": TensorParallel(),
-        "moe.shared_expert": TensorParallel(),
+        "moe.experts": TensorParallel(output_layout=Partial()),
+        "moe.shared_expert": TensorParallel(output_layout=Partial()),
     }
     parallelize_module(
         module=transformer_block,
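To unpack the "input Replicate, output Partial" comment: with `output_layout=Partial()`, each TP rank produces a partial sum of the experts' output, and the cross-rank reduction only happens when that `Partial` DTensor is redistributed to `Replicate` (or another placement). The following standalone sketch (not torchtitan code) shows this DTensor behavior; it assumes a 2-process run launched via `torchrun --nproc_per_node=2` and uses the gloo backend so it runs on CPU.

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Partial, Replicate

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),))

# pretend each rank computed a partial expert output of shape (tokens, dim)
local_partial = torch.ones(4, 8) * (dist.get_rank() + 1)
out = DTensor.from_local(local_partial, mesh, (Partial(),), run_check=False)

# converting Partial -> Replicate performs the all-reduce (sum) across the TP group
replicated = out.redistribute(placements=(Replicate(),))
print(replicated.to_local()[0, 0])  # tensor(3.) with 2 ranks: 1 + 2

dist.destroy_process_group()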

torchtitan/experiments/llama4/model/args.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ class TransformerModelArgs(BaseModelArgs):
     interleave_moe_layer_step: int = 2
     # token-choice
     top_k: int = 1
+    use_grouped_mm: bool = True  # grouped mm or for-loop for the experts computation

     def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
         self.norm_type = job_config.model.norm_type
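A quick, hypothetical usage sketch of the new flag: constructing the experts module with `use_grouped_mm=False` keeps the original for-loop/bmm path around as a correctness and readability baseline (dimensions below are made up, and `init_weights` still has to be called before the parameters are usable).

from torchtitan.experiments.llama4.model.moe import GroupedExperts

# for-loop / bmm baseline
experts_loop = GroupedExperts(dim=64, hidden_dim=128, num_experts=4, use_grouped_mm=False)
# grouped-GEMM path (the new default)
experts_grouped = GroupedExperts(dim=64, hidden_dim=128, num_experts=4, use_grouped_mm=True)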

torchtitan/experiments/llama4/model/moe.py

Lines changed: 93 additions & 35 deletions
@@ -17,53 +17,72 @@ def __init__(
         dim: int,
         hidden_dim: int,
         num_experts: int,
+        use_grouped_mm: bool,
     ):
         super().__init__()
         self.num_experts = num_experts
         self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
         self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
         self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+        self.use_grouped_mm = use_grouped_mm

     def forward(
         self,
         x: torch.Tensor,
-        num_local_tokens_per_expert: torch.Tensor | None = None,
+        num_local_tokens_per_expert: torch.Tensor | list[int] | None = None,
     ) -> torch.Tensor:
-        if num_local_tokens_per_expert is not None:
-            # a tuple of tensors indexed by experts
-            # each with shape (tokens_per_expert(varying), dim)
-            x = torch.split(
-                x,
-                split_size_or_sections=num_local_tokens_per_expert.tolist(),
-                dim=0,
-            )
-            out_experts_splits = []
-            for expert_idx, x_expert in enumerate(x):
-                w1, w2, w3 = (
-                    self.w1[expert_idx],
-                    self.w2[expert_idx],
-                    self.w3[expert_idx],
+        # TODO: keeping this for loop implementation for comparison
+        # and readability, will remove later
+        if not self.use_grouped_mm:
+            if num_local_tokens_per_expert is not None:
+                # a tuple of tensors indexed by experts
+                # each with shape (tokens_per_expert(varying), dim)
+                x = torch.split(
+                    x,
+                    split_size_or_sections=num_local_tokens_per_expert,
+                    dim=0,
                 )
-                h = F.silu(torch.matmul(x_expert, w1))
-                h = h * torch.matmul(x_expert, w3)
-                h = torch.matmul(h, w2)
-                # h shape (tokens_per_expert(varying), dim)
-                out_experts_splits.append(h)
-            out = torch.cat(out_experts_splits, dim=0)
-
-            # TODO:optimize with GroupedGEMM
+                out_experts_splits = []
+                for expert_idx, x_expert in enumerate(x):
+                    w1, w2, w3 = (
+                        self.w1[expert_idx],
+                        self.w2[expert_idx],
+                        self.w3[expert_idx],
+                    )
+                    h = F.silu(torch.matmul(x_expert, w1))
+                    h = h * torch.matmul(x_expert, w3)
+                    h = torch.matmul(h, w2)
+                    # h shape (tokens_per_expert(varying), dim)
+                    out_experts_splits.append(h)
+                out = torch.cat(out_experts_splits, dim=0)
+            else:
+                # x shape (num_experts, tokens_per_expert, dim)
+                h = F.silu(torch.bmm(x, self.w1))
+                h = h * torch.bmm(x, self.w3)
+                # out shape (num_experts, tokens_per_expert, dim)
+                out = torch.bmm(h, self.w2)
+
+            return out
+
+        # grouped mm implementation
+        if num_local_tokens_per_expert is not None:
             # https://github.com/pytorch/pytorch/pull/150374
-            # _gouped_mm requires shapes to be multiple of 8
-            # offsets = torch.cumsum(num_local_tokens_per_expert, dim=0, dtype=torch.int32)
-            # h = F.silu(torch._grouped_mm(x, self.w1.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16))
-            # h = h * torch._grouped_mm(x, self.w3.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
-            # out = torch._grouped_mm(h, self.w2.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
+            # NOTE: torch._gouped_mm requires bf16 dtypes
+            # and shapes to be multiple of 8
+            offsets = torch.cumsum(
+                num_local_tokens_per_expert, dim=0, dtype=torch.int32
+            )
+            # grouped mm between a 2D tensor and a 3D tensor
+            assert x.dim() == 2
         else:
-            # x shape (num_experts, tokens_per_expert, dim)
-            h = F.silu(torch.bmm(x, self.w1))
-            h = h * torch.bmm(x, self.w3)
-            # out shape (num_experts, tokens_per_expert, dim)
-            out = torch.bmm(h, self.w2)
+            offsets = None
+            # fall back to regular bmm between 3D tensors
+            assert x.dim() == 3
+
+        h = F.silu(torch._grouped_mm(x, self.w1, offs=offsets))
+        h = h * torch._grouped_mm(x, self.w3, offs=offsets)
+        out = torch._grouped_mm(h, self.w2, offs=offsets)
+
         return out

     def init_weights(self, init_std: float):
@@ -166,14 +185,23 @@ def __init__(self, model_args: TransformerModelArgs):
             hidden_dim = int(hidden_dim / hidden_dim_denom)
         hidden_dim += -hidden_dim % model_args.multiple_of

+        self.use_grouped_mm = model_args.use_grouped_mm
         self.experts = GroupedExperts(
-            dim=dim, hidden_dim=hidden_dim, num_experts=num_experts
+            dim=dim,
+            hidden_dim=hidden_dim,
+            num_experts=num_experts,
+            use_grouped_mm=self.use_grouped_mm,
         )
         self.router = TokenChoiceTopKRouter(
             dim=dim, num_experts=num_experts, top_k=model_args.top_k
         )
         self.shared_expert = (
-            GroupedExperts(dim=dim, hidden_dim=hidden_dim, num_experts=1)
+            GroupedExperts(
+                dim=dim,
+                hidden_dim=hidden_dim,
+                num_experts=1,
+                use_grouped_mm=self.use_grouped_mm,
+            )
             if model_args.use_shared_expert
             else None
         )
@@ -206,6 +234,36 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         )
         routed_input = routed_input * top_scores.reshape(-1, 1)

+        if self.use_grouped_mm:
+            # NOTE: In order to use torch._grouped_mm, we need to make sure
+            # the number of tokens each expert gets is a multiple of 16.
+            # The following kernel helps achieve this via padding, without
+            # incurring synchronization between device and host.
+            from torchtitan.experiments.kernels.moe.indices import (
+                generate_permute_indices,
+            )
+
+            ALIGN_SIZE_M = 16
+
+            with torch.no_grad():
+                permuted_indices, m_sizes = generate_permute_indices(
+                    num_local_tokens_per_expert,
+                    self.experts.num_experts,
+                    1,
+                    token_indices.shape[0] + self.experts.num_experts * ALIGN_SIZE_M,
+                    ALIGN_SIZE_M,
+                )
+            num_local_tokens_per_expert = m_sizes
+            token_indices = torch.vstack(
+                (token_indices, token_indices.new_zeros((dim)))
+            )
+            token_indices = token_indices[permuted_indices, :]
+            routed_input = torch.vstack((routed_input, routed_input.new_zeros((dim))))
+            routed_input = routed_input[permuted_indices, :]
+        else:
+            # NOTE: this would incur a synchronization between device and host
+            num_local_tokens_per_expert = num_local_tokens_per_expert.tolist()
+
         # shape (bs*slen*top_k, dim)
         routed_output = self.experts(routed_input, num_local_tokens_per_expert)
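The NOTE in the hunk above is the key constraint: `torch._grouped_mm` wants each expert's token count padded up to a multiple of 16, and the `generate_permute_indices` kernel produces both the padded sizes and the permuted token indices without reading the counts back to the host. The snippet below is an illustrative sketch of just the padding/offsets arithmetic (it is not the kernel itself, and the real kernel may treat empty experts or padding slots differently); the `new_zeros` + `vstack` in the hunk above appends a dummy row for the padding slots to point at.

import torch

ALIGN_SIZE_M = 16

def padded_offsets(num_tokens_per_expert: torch.Tensor) -> torch.Tensor:
    # round each expert's token count up to a multiple of ALIGN_SIZE_M ...
    padded = (
        (num_tokens_per_expert + ALIGN_SIZE_M - 1) // ALIGN_SIZE_M
    ) * ALIGN_SIZE_M
    # ... and return the cumulative int32 group-end offsets that
    # torch._grouped_mm consumes
    return torch.cumsum(padded, dim=0, dtype=torch.int32)

counts = torch.tensor([5, 0, 23, 16])  # tokens routed to each of 4 experts
print(padded_offsets(counts))          # tensor([16, 16, 48, 64], dtype=torch.int32)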

torchtitan/experiments/llama4/train_configs/debug_model.toml

Lines changed: 3 additions & 1 deletion
@@ -29,7 +29,9 @@ use_flex_attn = false
 attn_mask_type = "causal" # causal / block_causal

 [optimizer]
-name = "AdamW"
+# TODO: currently grouped mm in MoE doesn't work with AdamW, need to investigate
+# name = "AdamW"
+name = "Adam"
 lr = 4e-3
 eps = 1e-15

torchtitan/train.py

Lines changed: 1 addition & 1 deletion
@@ -391,7 +391,7 @@ def train_step(self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor):
                 dist_utils.dist_max(loss, world_mesh["dp_cp"]),
             )
         else:
-            global_avg_loss = global_max_loss = loss.item()
+            global_avg_loss = global_max_loss = loss.detach().item()

         self.metrics_processor.log(self.step, global_avg_loss, global_max_loss)
