Commit 227c288

[TRTLLM-8827] [feat] Enable low precision alltoall for Cutlass and TRTLLMGen backends (#8675)
Signed-off-by: Kaiyu Xie <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 00161b3 commit 227c288

File tree: 5 files changed (+138 −53 lines)

  tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
  tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
  tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
  tensorrt_llm/_torch/modules/fused_moe/interface.py
  tests/unittest/_torch/modules/test_fused_moe.py
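The hunks below thread a new use_low_precision_moe_combine flag from the model config into both the Cutlass and TRTLLMGen backends and select an all-to-all method at construction time. A minimal sketch of how the flag is gated, assuming a stand-in ModelConfig with only the field taken from the diff (the real tensorrt_llm class has many more):

# Sketch only: mirrors the __init__ gating added in fused_moe_cutlass.py and
# fused_moe_trtllm_gen.py below. `ModelConfig` here is a stand-in, not the
# tensorrt_llm class; only `use_low_precision_moe_combine` comes from the diff.
from dataclasses import dataclass


@dataclass
class ModelConfig:
    use_low_precision_moe_combine: bool = False


def resolve_low_precision_combine(model_config: ModelConfig,
                                  enable_alltoall: bool) -> bool:
    # The flag only takes effect when alltoall is enabled; otherwise the
    # module keeps the default of False and uses the regular combine path.
    use_low_precision_combine = False
    if enable_alltoall:
        use_low_precision_combine = model_config.use_low_precision_moe_combine
    return use_low_precision_combine


assert resolve_low_precision_combine(ModelConfig(True), enable_alltoall=True)
assert not resolve_low_precision_combine(ModelConfig(True), enable_alltoall=False)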

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 67 additions & 23 deletions
@@ -6,11 +6,12 @@
 
 from tensorrt_llm._mnnvl_utils import MnnvlMemory, MnnvlMoe
 from tensorrt_llm._torch.distributed.moe_alltoall import MoeAlltoAll
+from tensorrt_llm.logger import logger
 
 from ...distributed import allgather
 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, EventType, Fp4QuantizedTensor, ceil_div
-from .interface import MoE
+from .interface import AlltoallMethodType, MoE
 
 # isort: off
 from .quantization import (
@@ -140,28 +141,44 @@ def __init__(
         self.has_been_profiled_min_latency = False
 
         # TODO: AlltoAll code is largely duplicated with WideEPMoE. Consider refactor and reuse in the future.
+        self.alltoall_method_type = self.select_alltoall_method_type()
+        logger.info_once(
+            f"{self.__class__.__name__} selects alltoall_method_type {self.alltoall_method_type!r}",
+            key="alltoall_method_type")
         self.alltoall_workspace = None
         self.alltoall_prepare_workspace = None
+        self.use_low_precision_combine = False
         if self.enable_alltoall:
-            if self.moe_alltoall_backend == "mnnvllatency":
-                MnnvlMemory.initialize()
-                self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(
-                    model_config.mapping)
-                self.alltoall_prepare_workspace = MnnvlMoe.get_moe_prepare_workspace(
-                    model_config.mapping)
-            elif self.moe_alltoall_backend == "mnnvlthroughput":
-                workspace_mb = int(
-                    os.environ.get("TRTLLM_MOE_A2A_WORKSPACE_MB", "512"))
-                self.moe_a2a = MoeAlltoAll(
-                    mapping=self.mapping,
-                    max_num_tokens_per_rank=model_config.max_num_tokens,
-                    top_k=self.routing_method.experts_per_token,
-                    num_experts=self.num_experts,
-                    workspace_size_per_rank=workspace_mb * 1024 * 1024,
+            self.use_low_precision_combine = model_config.use_low_precision_moe_combine
+
+            if self.alltoall_method_type == AlltoallMethodType.MNNVL:
+                if self.moe_alltoall_backend == "mnnvllatency":
+                    MnnvlMemory.initialize()
+                    self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(
+                        model_config.mapping)
+                    self.alltoall_prepare_workspace = MnnvlMoe.get_moe_prepare_workspace(
+                        model_config.mapping)
+                elif self.moe_alltoall_backend == "mnnvlthroughput":
+                    workspace_mb = int(
+                        os.environ.get("TRTLLM_MOE_A2A_WORKSPACE_MB", "512"))
+                    self.moe_a2a = MoeAlltoAll(
+                        mapping=self.mapping,
+                        max_num_tokens_per_rank=model_config.max_num_tokens,
+                        top_k=self.routing_method.experts_per_token,
+                        num_experts=self.num_experts,
+                        workspace_size_per_rank=workspace_mb * 1024 * 1024,
+                    )
+                else:
+                    raise ValueError(
+                        f"Unsupported moe alltoall backend: {self.moe_alltoall_backend}"
+                    )
+            elif self.alltoall_method_type == AlltoallMethodType.DeepEP or self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
+                raise NotImplementedError(
+                    "DeepEP and DeepEPLowLatency are not supported for CutlassFusedMoE yet"
                 )
             else:
-                raise ValueError(
-                    f"Unsupported moe alltoall backend: {self.moe_alltoall_backend}"
+                raise NotImplementedError(
+                    f"Not available alltoall method type: {self.alltoall_method_type!r}"
                 )
 
         # If True, the router weight will be multiplied on the input rather than at the end of FC2
@@ -204,13 +221,38 @@ def has_int8_woq_per_channel(self):
         return self.quant_config.layer_quant_mode.is_int8_weight_only(
         ) and not self.quant_config.layer_quant_mode.has_per_group_scaling()
 
+    def select_alltoall_method_type(self) -> AlltoallMethodType:
+        all2all_method_type = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
+        if all2all_method_type is not None:
+            if AlltoallMethodType[all2all_method_type] in [
+                    AlltoallMethodType.DeepEP,
+                    AlltoallMethodType.DeepEPLowLatency
+            ]:
+                raise NotImplementedError(
+                    "DeepEP and DeepEPLowLatency are not supported for CutlassFusedMoE yet"
+                )
+            return AlltoallMethodType[all2all_method_type]
+
+        if not self.mapping.enable_attention_dp:
+            return AlltoallMethodType.NotEnabled
+
+        if self.mapping.tp_size == 1:
+            return AlltoallMethodType.NotEnabled
+
+        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
+            return AlltoallMethodType.NotEnabled
+
+        if not (self.mapping.moe_ep_size > self.routing_method.experts_per_token
+                and MnnvlMemory.supports_mnnvl()):
+            return AlltoallMethodType.NotEnabled
+
+        return AlltoallMethodType.MNNVL
+
     @cached_property
     def enable_alltoall(self):
-        return (self.mapping.moe_ep_size > self.routing_method.experts_per_token
-                and self.mapping.enable_attention_dp
-                and self.mapping.tp_size > 1
-                and os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") != "1"
-                and MnnvlMemory.supports_mnnvl())
+        """ enable_alltoall (bool): whether to enable alltoall instead of allgather/reducescatter
+        """
+        return self.alltoall_method_type != AlltoallMethodType.NotEnabled
 
     @cached_property
     def moe_alltoall_backend(self):
@@ -510,6 +552,8 @@ def forward_chunk(
                     ep_rank=self.ep_rank,
                     ep_size=self.ep_size,
                     top_k=top_k,
+                    use_low_precision_combine=self.
+                    use_low_precision_combine,
                     token_count=token_count)
             elif self.moe_alltoall_backend == "mnnvlthroughput":
                 hidden = final_hidden_states.shape[-1]
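For reference, the selection logic added above can be read in isolation. The sketch below is not the TensorRT-LLM class: `mapping` is any object exposing the three attributes the method consults, and `supports_mnnvl` stands in for `MnnvlMemory.supports_mnnvl()`.

# Standalone sketch of select_alltoall_method_type(), under the assumptions above.
import os
from enum import IntEnum
from types import SimpleNamespace


class AlltoallMethodType(IntEnum):
    NotEnabled = 0
    MNNVL = 1
    DeepEP = 2
    DeepEPLowLatency = 3


def select_alltoall_method_type(mapping, experts_per_token: int,
                                supports_mnnvl: bool) -> AlltoallMethodType:
    # An explicit override wins, except that DeepEP variants are rejected
    # because CutlassFusedMoE does not implement them yet.
    forced = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
    if forced is not None:
        method = AlltoallMethodType[forced]
        if method in (AlltoallMethodType.DeepEP,
                      AlltoallMethodType.DeepEPLowLatency):
            raise NotImplementedError(
                "DeepEP and DeepEPLowLatency are not supported for CutlassFusedMoE yet")
        return method
    # Alltoall only applies with attention DP and TP > 1.
    if not mapping.enable_attention_dp or mapping.tp_size == 1:
        return AlltoallMethodType.NotEnabled
    # Environment kill switch for alltoallv.
    if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
        return AlltoallMethodType.NotEnabled
    # Need more EP ranks than experts-per-token, plus MNNVL support.
    if not (mapping.moe_ep_size > experts_per_token and supports_mnnvl):
        return AlltoallMethodType.NotEnabled
    return AlltoallMethodType.MNNVL


mapping = SimpleNamespace(enable_attention_dp=True, tp_size=8, moe_ep_size=8)
print(select_alltoall_method_type(mapping, experts_per_token=4, supports_mnnvl=True))
# AlltoallMethodType.MNNVL (unless one of the environment overrides is set)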

tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py

Lines changed: 55 additions & 12 deletions
@@ -7,13 +7,14 @@
 
 from tensorrt_llm._mnnvl_utils import MnnvlMemory, MnnvlMoe
 from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm.logger import logger
 
 from ...custom_ops.trtllm_gen_custom_ops import \
     fp4_block_scale_fake_output_without_finalize
 from ...distributed import allgather
 from ...model_config import ModelConfig
 from ...utils import Fp4QuantizedTensor, ceil_div
-from .interface import MoE, MoEWeightLoadingMode
+from .interface import AlltoallMethodType, MoE, MoEWeightLoadingMode
 from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethod,
                            NVFP4TRTLLMGenFusedMoEMethod,
                            W4A8MXFP4FP8TRTLLMGenFusedMoEMethod,
@@ -109,27 +110,68 @@ def __init__(
         assert len(
             self.initial_local_expert_ids) == self.expert_size_per_partition
 
+        # TODO: AlltoAll code is largely duplicated with WideEPMoE. Consider refactor and reuse in the future.
+        self.alltoall_method_type = self.select_alltoall_method_type()
+        logger.info_once(
+            f"{self.__class__.__name__} selects alltoall_method_type {self.alltoall_method_type!r}",
+            key="alltoall_method_type")
         self.alltoall_workspace = None
         self.alltoall_prepare_workspace = None
+        self.use_low_precision_combine = False
         if self.enable_alltoall:
-            MnnvlMemory.initialize()
-            self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(
-                model_config.mapping)
-            self.alltoall_prepare_workspace = MnnvlMoe.get_moe_prepare_workspace(
-                model_config.mapping)
+            self.use_low_precision_combine = model_config.use_low_precision_moe_combine
+
+            if self.alltoall_method_type == AlltoallMethodType.MNNVL:
+                MnnvlMemory.initialize()
+                self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(
+                    model_config.mapping)
+                self.alltoall_prepare_workspace = MnnvlMoe.get_moe_prepare_workspace(
+                    model_config.mapping)
+            elif self.alltoall_method_type == AlltoallMethodType.DeepEP or self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
+                raise NotImplementedError(
+                    "DeepEP and DeepEPLowLatency are not supported for TRTLLMGenFusedMoE yet"
+                )
+            else:
+                raise NotImplementedError(
+                    f"Not available alltoall method type: {self.alltoall_method_type!r}"
+                )
 
         self._weights_created = False
         if not model_config.skip_create_weights_in_init:
            self.create_weights()
 
+    def select_alltoall_method_type(self) -> AlltoallMethodType:
+        all2all_method_type = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
+        if all2all_method_type is not None:
+            if AlltoallMethodType[all2all_method_type] in [
+                    AlltoallMethodType.DeepEP,
+                    AlltoallMethodType.DeepEPLowLatency
+            ]:
+                raise NotImplementedError(
+                    "DeepEP and DeepEPLowLatency are not supported for CutlassFusedMoE yet"
+                )
+            return AlltoallMethodType[all2all_method_type]
+
+        if not self.mapping.enable_attention_dp:
+            return AlltoallMethodType.NotEnabled
+
+        if self.mapping.tp_size == 1:
+            return AlltoallMethodType.NotEnabled
+
+        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
+            return AlltoallMethodType.NotEnabled
+
+        if not (self.mapping.moe_ep_size > self.routing_method.experts_per_token
+                and MnnvlMemory.supports_mnnvl()):
+            return AlltoallMethodType.NotEnabled
+
+        return AlltoallMethodType.MNNVL
+
     @cached_property
     def enable_alltoall(self):
-        mapping = self.mapping
-        routing_experts = self.routing_method.experts_per_token
-        return (mapping.moe_ep_size > routing_experts
-                and mapping.enable_attention_dp and mapping.tp_size > 1
-                and os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") != "1"
-                and MnnvlMemory.supports_mnnvl())
+        """ enable_alltoall (bool): whether to enable alltoall instead of allgather/reducescatter
+        """
+        return self.alltoall_method_type != AlltoallMethodType.NotEnabled
 
     def _check_configs(self):
         assert self.has_deepseek_fp8_block_scales \
@@ -608,6 +650,7 @@ def forward_impl(
                 ep_rank=self.ep_rank,
                 ep_size=self.ep_size,
                 top_k=top_k,
+                use_low_precision_combine=self.use_low_precision_combine,
                 token_count=token_count,
             )
 
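Both backends log the chosen method via `logger.info_once(..., key=...)`, so constructing many MoE layers reports the selection once rather than per layer. A rough illustration of that keyed log-once pattern follows; this is a sketch of the idea, not the actual tensorrt_llm.logger implementation.

import logging

logging.basicConfig(level=logging.INFO)
_seen_keys: set = set()


def info_once(logger: logging.Logger, msg: str, *, key: str) -> None:
    # Emit a message only the first time its key is seen, so repeated module
    # construction does not flood the log with identical lines.
    if key not in _seen_keys:
        _seen_keys.add(key)
        logger.info(msg)


log = logging.getLogger("fused_moe")
info_once(log, "TRTLLMGenFusedMoE selects alltoall_method_type <AlltoallMethodType.MNNVL: 1>",
          key="alltoall_method_type")
info_once(log, "TRTLLMGenFusedMoE selects alltoall_method_type <AlltoallMethodType.MNNVL: 1>",
          key="alltoall_method_type")  # suppressed: same key already logged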

tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py

Lines changed: 1 addition & 14 deletions
@@ -1,5 +1,4 @@
 import os
-from enum import IntEnum
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
@@ -15,7 +14,7 @@
 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, EventType, Fp4QuantizedTensor
 from .deep_ep_utils import buffer_pool, deep_ep_installed
-from .interface import MoE
+from .interface import AlltoallMethodType, MoE
 from .moe_load_balancer import get_moe_load_balancer
 from .ops import MoEOp, MoEOpSelector
 from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethod,
@@ -26,18 +25,6 @@
 from .routing import BaseMoeRoutingMethod
 
 
-# The type of alltoall method
-class AlltoallMethodType(IntEnum):
-    # Not available
-    NotEnabled = 0
-    # MNNVL
-    MNNVL = 1
-    # DeepEP intranode or internode: CUDA Graphs are supported, IBGDA is required by internode
-    DeepEP = 2
-    # DeepEP low latency: CUDA Graphs are supported, IBGDA is required
-    DeepEPLowLatency = 3
-
-
 class WideEPMoE(MoE):
     """
     Fused Mixture of Experts (MoE) Layer with for wide EP.

tensorrt_llm/_torch/modules/fused_moe/interface.py

Lines changed: 13 additions & 1 deletion
@@ -1,6 +1,6 @@
 import weakref
 from abc import abstractmethod
-from enum import Enum
+from enum import Enum, IntEnum
 from typing import Dict, List, Optional, Union, final
 
 import torch
@@ -22,6 +22,18 @@ class MoEWeightLoadingMode(Enum):
     W4A8_CUSTOM = 2
 
 
+# The type of alltoall method
+class AlltoallMethodType(IntEnum):
+    # Not available
+    NotEnabled = 0
+    # MNNVL
+    MNNVL = 1
+    # DeepEP intranode or internode: CUDA Graphs are supported, IBGDA is required by internode
+    DeepEP = 2
+    # DeepEP low latency: CUDA Graphs are supported, IBGDA is required
+    DeepEPLowLatency = 3
+
+
 def extract_extra_attrs(layer_idx: str):
     extra_attrs = get_model_extra_attrs()
     assert extra_attrs is not None, "Model extra attrs are not set"
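With the enum promoted to interface.py, the backends and the unit test all import it from one place (see the test_fused_moe.py hunk below). A quick check of the name-based lookup that the TRTLLM_FORCE_ALLTOALL_METHOD override relies on; running it requires a TensorRT-LLM installation.

# New canonical import path for the enum.
from tensorrt_llm._torch.modules.fused_moe.interface import AlltoallMethodType

# os.environ values are strings, so the override maps a name back to a member.
assert AlltoallMethodType["MNNVL"] is AlltoallMethodType.MNNVL
# IntEnum keeps stable integer values, so members compare equal to ints.
assert AlltoallMethodType.DeepEPLowLatency == 3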

tests/unittest/_torch/modules/test_fused_moe.py

Lines changed: 2 additions & 3 deletions
@@ -25,9 +25,8 @@
     CuteDslFusedMoE
 from tensorrt_llm._torch.modules.fused_moe.fused_moe_deepgemm import \
     DeepGemmFusedMoE
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_wide_ep import \
-    AlltoallMethodType
-from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode
+from tensorrt_llm._torch.modules.fused_moe.interface import (
+    AlltoallMethodType, MoEWeightLoadingMode)
 
 # isort and yapf will fight against each other here, so we disable isort
 # isort: off
