Commit 313cfc3

NVFP4CuteDslFusedMoEMethod

Signed-off-by: Enwei Zhu <[email protected]>
1 parent 98948a4

File tree

3 files changed: +42 additions, -44 deletions

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
Lines changed: 9 additions & 8 deletions

@@ -10,7 +10,7 @@
 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, Fp4QuantizedTensor, ceil_div
 from .fused_moe_cutlass import CutlassFusedMoE
-from .quantization import MoEWeightLoadingMode
+from .quantization import MoEWeightLoadingMode, NVFP4CuteDslFusedMoEMethod
 from .routing import BaseMoeRoutingMethod


@@ -150,8 +150,7 @@ def cute_dsl_nvfp4_grouped_gemm_ref(


 class CuteDslFusedMoE(CutlassFusedMoE):
-    """
-    Python Flow of Fused Mixture of Experts (MoE) Layer.
+    """CuteDSL flow of fused mixture of experts (MoE) Layer.

     Args:
         num_experts (int): Number of experts in the MoE layer.
@@ -162,11 +161,6 @@ class CuteDslFusedMoE(CutlassFusedMoE):
         dtype (Optional[torch.dtype]): Data type for the weights.
         reduce_results (bool): Whether to reduce the results across devices.
         model_config (ModelConfig): Configuration object for the model.
-
-    This backend is composed of multiple custom ops:
-    1. moe_permute_op: permute the input tensor and the expert selected tensor.
-    2. cute_dsl_fp8_group_blockwise_gemm_ref: a reference implementation of the cute_dsl_fp8_group_blockwise_gemm.
-    3. moe_finalize_scale_op: finalize the scale of the output tensor.
     """

     def __init__(
@@ -201,6 +195,13 @@ def __init__(
             layer_idx=layer_idx,
         )

+    def _get_quant_method(self):
+        if self.quant_config is not None and self.quant_config.layer_quant_mode.has_any_quant(
+                exclude_kv_cache=True):
+            if self.quant_config.layer_quant_mode.has_nvfp4():
+                return NVFP4CuteDslFusedMoEMethod()
+        return super()._get_quant_method()
+
     def forward_chunk_unquantized(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],
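
For orientation, here is a minimal sketch of where the new override is expected to take effect. The construction arguments and the surrounding create/load lifecycle belong to the CutlassFusedMoE base class and are not part of this diff, so they are assumptions here:

# Hedged usage sketch; exact constructor arguments and call sites are
# assumptions based on the CutlassFusedMoE base class, not this diff.
moe = CuteDslFusedMoE(num_experts=..., dtype=..., model_config=...)
quant_method = moe._get_quant_method()
# With an NVFP4 quant config this now returns NVFP4CuteDslFusedMoEMethod,
# so its post_load_weights() applies the CuteDSL-specific FC1 interleaving
# after checkpoint weights are loaded; any other quant mode falls through
# to the Cutlass behavior via super()._get_quant_method().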

tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
Lines changed: 1 addition & 7 deletions

@@ -348,8 +348,7 @@ def set_strides(workspace: torch.Tensor, g: int, m: int, k: int):


 class DeepGemmFusedMoE(CutlassFusedMoE):
-    """
-    Python Flow of Fused Mixture of Experts (MoE) Layer.
+    """DeepGEMM flow of fused mixture of experts (MoE) Layer.

     Args:
         num_experts (int): Number of experts in the MoE layer.
@@ -360,11 +359,6 @@ class DeepGemmFusedMoE(CutlassFusedMoE):
         dtype (Optional[torch.dtype]): Data type for the weights.
         reduce_results (bool): Whether to reduce the results across devices.
         model_config (ModelConfig): Configuration object for the model.
-
-    This backend is composed of multiple custom ops:
-    1. moe_permute_op: permute the input tensor and the expert selected tensor.
-    2. cute_dsl_fp8_group_blockwise_gemm_ref: a reference implementation of the cute_dsl_fp8_group_blockwise_gemm.
-    3. moe_finalize_scale_op: finalize the scale of the output tensor.
     """

     # To reuse pytorch memory segments allocated during graph capture.

tensorrt_llm/_torch/modules/fused_moe/quantization.py
Lines changed: 32 additions & 29 deletions

@@ -1828,35 +1828,6 @@ def setup_quant_scales(self, module: torch.nn.Module):
             fc2_global=module.fc2_alpha,
         )

-    def post_load_weights(self, module: torch.nn.Module):
-        super().post_load_weights(module)
-        if module.moe_backend == "CUTEDSL":
-            # Interleave FC1 weight and scales for GEMM1 + SwiGLU fusion.
-            w3_w1_weight = module.w3_w1_weight.data.view(float4_e2m1x2)
-            m = w3_w1_weight.size(1)
-            n = w3_w1_weight.size(2) * 2
-            w3_w1_weight_interleaved = interleave_linear_and_gate(w3_w1_weight,
-                                                                  group_size=64,
-                                                                  dim=1)
-            w3_w1_weight_interleaved = w3_w1_weight_interleaved.view(
-                module.w3_w1_weight.data.dtype)
-            module.w3_w1_weight.data.copy_(w3_w1_weight_interleaved)
-
-            w3_w1_weight_scale = module.quant_scales.fc1_weight_block.data.view(
-                float4_sf_dtype)
-            w3_w1_weight_scale_unswizzled = unswizzle_sf(
-                w3_w1_weight_scale, m, n).view(-1, m,
-                                               n // module.scaling_vector_size)
-            w3_w1_weight_scale_unswizzled_interleaved = interleave_linear_and_gate(
-                w3_w1_weight_scale_unswizzled, group_size=64, dim=1)
-            w3_w1_weight_scale_interleaved = swizzle_sf(
-                w3_w1_weight_scale_unswizzled_interleaved, m,
-                n).view(-1, m, n // module.scaling_vector_size)
-            w3_w1_weight_scale_interleaved = w3_w1_weight_scale_interleaved.view(
-                module.quant_scales.fc1_weight_block.data.dtype)
-            module.quant_scales.fc1_weight_block.data.copy_(
-                w3_w1_weight_scale_interleaved)
-

 class NVFP4CutlassFusedMoEMethod(NVFP4FusedMoEMethod):
     weight_dtype = FUSED_MOE_NVFP4_WEIGHT_DTYPE
@@ -1935,6 +1906,38 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
         dst_w2_weight_scale.copy_(dst_w2_weight_scale_interleaved)


+class NVFP4CuteDslFusedMoEMethod(NVFP4CutlassFusedMoEMethod):
+
+    def post_load_weights(self, module: torch.nn.Module):
+        super().post_load_weights(module)
+
+        # Interleave FC1 weight and scales for GEMM1 + SwiGLU fusion.
+        w3_w1_weight = module.w3_w1_weight.data.view(float4_e2m1x2)
+        m = w3_w1_weight.size(1)
+        n = w3_w1_weight.size(2) * 2
+        w3_w1_weight_interleaved = interleave_linear_and_gate(w3_w1_weight,
+                                                              group_size=64,
+                                                              dim=1)
+        w3_w1_weight_interleaved = w3_w1_weight_interleaved.view(
+            module.w3_w1_weight.data.dtype)
+        module.w3_w1_weight.data.copy_(w3_w1_weight_interleaved)
+
+        w3_w1_weight_scale = module.quant_scales.fc1_weight_block.data.view(
+            float4_sf_dtype)
+        w3_w1_weight_scale_unswizzled = unswizzle_sf(
+            w3_w1_weight_scale, m, n).view(-1, m,
+                                           n // module.scaling_vector_size)
+        w3_w1_weight_scale_unswizzled_interleaved = interleave_linear_and_gate(
+            w3_w1_weight_scale_unswizzled, group_size=64, dim=1)
+        w3_w1_weight_scale_interleaved = swizzle_sf(
+            w3_w1_weight_scale_unswizzled_interleaved, m,
+            n).view(-1, m, n // module.scaling_vector_size)
+        w3_w1_weight_scale_interleaved = w3_w1_weight_scale_interleaved.view(
+            module.quant_scales.fc1_weight_block.data.dtype)
+        module.quant_scales.fc1_weight_block.data.copy_(
+            w3_w1_weight_scale_interleaved)
+
+
 class NVFP4TRTLLMGenFusedMoEMethod(NVFP4FusedMoEMethod):
     weight_dtype = float4_sf_dtype
     block_scales_dtype = torch.float8_e4m3fn
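
The helper interleave_linear_and_gate is not defined in this diff. The sketch below is an illustrative reference only, assuming the helper splits the concatenated [w3; w1] tensor into two halves along `dim` and interleaves them in `group_size` chunks so a fused GEMM1 + SwiGLU kernel can read matching gate/linear groups contiguously; the chunk ordering and the real helper's layout are assumptions:

import torch

def interleave_linear_and_gate_ref(w: torch.Tensor, group_size: int,
                                   dim: int) -> torch.Tensor:
    """Illustrative reference only; the real helper lives in TensorRT-LLM.

    Assumed behavior: split `w` into two halves along `dim` (the stacked
    w3/w1 projections) and interleave them in group_size-sized chunks:
    [a0, a1, ..., b0, b1, ...] -> [a0, b0, a1, b1, ...].
    """
    half = w.size(dim) // 2
    first, second = w.narrow(dim, 0, half), w.narrow(dim, half, half)
    # View each half as [..., num_groups, group_size, ...] along `dim`.
    first = first.unflatten(dim, (half // group_size, group_size))
    second = second.unflatten(dim, (half // group_size, group_size))
    # Pair chunk g of each half, then flatten back to the original extent.
    paired = torch.stack((first, second), dim=dim + 1)
    return paired.flatten(dim, dim + 2)

Note the round trip on the scales in the diff above: the block scales are stored swizzled, so fc1_weight_block is first unswizzled with unswizzle_sf, interleaved in the row-major view, and re-swizzled with swizzle_sf so the scale layout keeps matching the interleaved weights.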
