From 3c838f4eeb84e90485bdc773f86841676acc2043 Mon Sep 17 00:00:00 2001
From: liuyun7345
Date: Tue, 20 Jan 2026 23:24:50 +0800
Subject: [PATCH 1/3] fix(moe): improve error message for GroupedMLP FP8 support

This commit improves the error message when users try to use FP8 with the
legacy GroupedMLP implementation.

The new message clearly explains:
1. The legacy GroupedMLP only supports bf16
2. For FP8 support, users should use TEGroupedMLP
3. How to enable TEGroupedMLP (--no-moe-use-legacy-grouped-gemm)
4. The required TransformerEngine version (>= 1.9)

Fixes #1564
---
 megatron/core/transformer/moe/experts.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
index 62fb7a148c8..e2f963062e6 100644
--- a/megatron/core/transformer/moe/experts.py
+++ b/megatron/core/transformer/moe/experts.py
@@ -222,7 +222,11 @@ def forward(
         permuted_probs: torch.Tensor,
     ):
         """Forward step of the GroupedMLP."""
-        assert self.config.bf16, "Currently GroupedMLP for MoE only supports bf16."
+        assert self.config.bf16, (
+            "The legacy GroupedMLP only supports bf16. "
+            "For FP8 support, please use TEGroupedMLP by setting "
+            "'--no-moe-use-legacy-grouped-gemm' and ensuring TransformerEngine >= 1.9 is installed."
+        )
         if self.activation_recompute:
             self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput()
 

From 9ce23e196e9a8c14c843217ea7e6916d8e9a5aba Mon Sep 17 00:00:00 2001
From: Xin Yao
Date: Wed, 21 Jan 2026 09:29:58 +0800
Subject: [PATCH 2/3] Update experts.py

---
 megatron/core/transformer/moe/experts.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
index e2f963062e6..f37c7bec3b1 100644
--- a/megatron/core/transformer/moe/experts.py
+++ b/megatron/core/transformer/moe/experts.py
@@ -224,8 +224,9 @@ def forward(
         """Forward step of the GroupedMLP."""
         assert self.config.bf16, (
             "The legacy GroupedMLP only supports bf16. "
-            "For FP8 support, please use TEGroupedMLP by setting "
-            "'--no-moe-use-legacy-grouped-gemm' and ensuring TransformerEngine >= 1.9 is installed."
+            "For FP8 support, please use TEGroupedMLP instead, which is adopted by default "
+            "when TransformerEngine >= 1.9 is installed and '--moe-use-legacy-grouped-gemm' is "
+            "*not* set."
         )
         if self.activation_recompute:
             self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput()

From 945ecce2ec0a71c575dcec8a00e7dabd3c4e25b8 Mon Sep 17 00:00:00 2001
From: Xin Yao
Date: Wed, 21 Jan 2026 09:31:05 +0800
Subject: [PATCH 3/3] Update experts.py

---
 megatron/core/transformer/moe/experts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
index f37c7bec3b1..cf15faeaf90 100644
--- a/megatron/core/transformer/moe/experts.py
+++ b/megatron/core/transformer/moe/experts.py
@@ -224,7 +224,7 @@ def forward(
         """Forward step of the GroupedMLP."""
         assert self.config.bf16, (
             "The legacy GroupedMLP only supports bf16. "
-            "For FP8 support, please use TEGroupedMLP instead, which is adopted by default "
+            "For FP16/FP8 support, please use TEGroupedMLP instead, which is adopted by default "
             "when TransformerEngine >= 1.9 is installed and '--moe-use-legacy-grouped-gemm' is "
             "*not* set."
         )