
Commit 9ce5c52

swiglu fusion
Signed-off-by: Enwei Zhu <[email protected]>
1 parent a2c9f1c commit 9ce5c52
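
For context (not part of the commit message): the fusion applies a SwiGLU activation in the epilogue of the block-scaled grouped GEMM and requantizes the result to NVFP4. Below is a reference-only sketch of the assumed activation semantics, where the GEMM's n dimension packs the gate and up projections; the exact interleaving and the NVFP4 quantization live in the CUTE DSL kernel added by this commit.

# Reference-only sketch, not from this commit; assumes [gate, up] are the two
# chunked halves of the last dimension of the GEMM output.
import torch
import torch.nn.functional as F

def swiglu_reference(h: torch.Tensor) -> torch.Tensor:
    gate, up = h.chunk(2, dim=-1)   # h: [tokens, 2 * interm_size]
    return F.silu(gate) * up        # [tokens, interm_size]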

9 files changed: +3205 −82 lines changed


tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 304 additions & 4 deletions
@@ -26,6 +26,8 @@
     Sm100BlockScaledContiguousGroupedGemmKernel
 from ..cute_dsl_kernels.blackwell.blockscaled_contiguous_grouped_gemm_finalize_fusion import \
     Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel
+from ..cute_dsl_kernels.blackwell.blockscaled_contiguous_grouped_gemm_swiglu_fusion import \
+    Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel
 from ..cute_dsl_kernels.blackwell.dense_blockscaled_gemm_persistent import \
     Sm100BlockScaledPersistentDenseGemmKernel
 from ..cute_dsl_kernels.blackwell.utils import make_ptr
@@ -440,7 +442,7 @@ def generate_permuted_idx_to_expanded_idx(
 
     def inputs_pre_hook(self,
                         inputs: List[torch.Tensor]) -> List[torch.Tensor]:
-        a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles = inputs
+        a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles, *others = inputs
         num_tokens = self.infer_num_tokens(a.size(0))
         num_tokens_per_expert = self.generate_num_tokens_per_expert(
             num_tokens)
@@ -461,7 +463,7 @@ def inputs_pre_hook(self,
             [num_non_exiting_tiles_val],
             dtype=num_non_exiting_tiles.dtype,
             device=num_non_exiting_tiles.device)
-        return a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles
+        return a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles, *others
 
     def inputs_pre_hook_finalize_fusion(
             self, inputs: List[torch.Tensor]) -> List[torch.Tensor]:
@@ -623,7 +625,7 @@ def forward(self, inputs: List[torch.Tensor],
         assert tile_idx_to_group_idx.dtype == torch.int32
         assert tile_idx_to_group_idx.size() == (num_tiles, )
         assert num_non_exiting_tiles.dtype == torch.int32
-        assert num_non_exiting_tiles.size() == (1, )
+        assert num_non_exiting_tiles.numel() == 1
 
         c = torch.empty(m, n, dtype=self.output_dtype, device=a.device)
 
@@ -900,7 +902,7 @@ def forward(self, inputs: List[torch.Tensor],
         assert permuted_idx_to_expanded_idx.dtype == torch.int32
         assert permuted_idx_to_expanded_idx.size() == (m, )
         assert num_non_exiting_tiles.dtype == torch.int32
-        assert num_non_exiting_tiles.size() == (1, )
+        assert num_non_exiting_tiles.numel() == 1
         assert token_final_scales.dtype == torch.float32
         assert token_final_scales.dim() == 2
         num_tokens = token_final_scales.size(0)
@@ -1091,6 +1093,304 @@ def _(
                          dtype=output_dtype,
                          device=input.device)
 
+class Sm100BlockScaledContiguousGroupedGemmSwigluFusionRunner(
+        TunableRunner):
+    kernel_class = Sm100BlockScaledContiguousGroupedGemmSwigluFusionKernel
+    kernel_cache = dict()
+    tuning_config_cache = dict()
+
+    def __init__(self,
+                 num_experts: int,
+                 top_k: int,
+                 num_local_experts: int,
+                 local_expert_offset: int,
+                 tile_size: int,
+                 scaling_vector_size: int = 16):
+        super().__init__()
+        self.num_experts = num_experts
+        self.top_k = top_k
+        self.num_local_experts = num_local_experts
+        self.local_expert_offset = local_expert_offset
+        self.tile_size = tile_size
+        self.scaling_vector_size = scaling_vector_size
+
+        if get_sm_version() != 100:
+            raise ValueError(
+                f"SM version {get_sm_version()} is not supported for {self.__class__.__name__}, it only supports SM 100"
+            )
+
+    def get_valid_tactics(
+        self,
+        inputs: List[torch.Tensor],
+        profile: OptimizationProfile,
+        **kwargs,
+    ) -> List[Tuple[int, int]]:
+        a, b, *_ = inputs
+        m, k = a.size(0), a.size(1) * 2
+        l, n = b.size(0), b.size(1)
+
+        # TODO: Add full shmoo
+        mma_tiler_mn_candidates = [(128, 128), (128, 256)]
+        cluster_shape_mn_candidates = [(1, 1), (1, 2)]
+
+        valid_tactics = []
+        for mma_tiler_mn, cluster_shape_mn in itertools.product(
+                mma_tiler_mn_candidates, cluster_shape_mn_candidates):
+            if self.__class__.kernel_class.can_implement(
+                    ab_dtype=cutlass.Float4E2M1FN,
+                    sf_dtype=cutlass.Float8E4M3FN,
+                    sf_vec_size=self.scaling_vector_size,
+                    acc_dtype=cutlass.Float32,
+                    c_dtype=cutlass.Float4E2M1FN,
+                    use_2cta_instrs=False,
+                    mma_tiler_mn=mma_tiler_mn,
+                    cluster_shape_mn=cluster_shape_mn,
+                    m=m,
+                    n=n,
+                    k=k,
+                    l=l,
+                    a_major="k",
+                    b_major="k",
+                    c_major="n",
+                    m_aligned=self.tile_size,
+            ):
+                valid_tactics.append((mma_tiler_mn, cluster_shape_mn))
+
+        return valid_tactics
+
+    def get_tuning_config(self) -> TuningConfig:
+        key = hash(self)
+        if key not in self.__class__.tuning_config_cache:
+            helper = GroupedGemmInputsHelper(self.num_experts, self.top_k,
+                                             self.num_local_experts,
+                                             self.local_expert_offset,
+                                             self.tile_size)
+            self.__class__.tuning_config_cache[key] = TuningConfig(
+                dynamic_tensor_specs=(DynamicTensorSpec(
+                    0, 0, helper.gen_tuning_buckets,
+                    helper.map_to_tuning_buckets), ),
+                constraint_specs=(ConstraintSpec(2, 0,
+                                                 fp4_scale_infer_shape),
+                                  ConstraintSpec(
+                                      5, 0,
+                                      helper.infer_shape_max_num_tiles)),
+                inputs_pre_hook=helper.inputs_pre_hook,
+            )
+        return self.__class__.tuning_config_cache[key]
+
+    def forward(self, inputs: List[torch.Tensor],
+                tactic: Optional[tuple]) -> torch.Tensor:
+        a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles, global_sf = inputs
+        assert a.dtype == torch.float4_e2m1fn_x2
+        assert a.dim() == 2
+        assert b.dtype == torch.float4_e2m1fn_x2
+        assert b.dim() == 3
+        assert a_sf.dtype == torch.uint8
+        assert a_sf.dim() == 1
+        assert b_sf.dtype == torch.uint8
+        assert b_sf.dim() == 3
+        assert alpha.dtype == torch.float32
+        assert alpha.dim() == 1
+
+        m, k = a.size(0), a.size(1) * 2
+        l, n = b.size(0), b.size(1)
+        scale_k = k // self.scaling_vector_size
+        interm_size = n // 2
+        assert m % self.tile_size == 0
+        assert k % (self.scaling_vector_size * 4) == 0
+        assert n % (self.scaling_vector_size * 4 * 2) == 0
+        assert b.size(2) * 2 == k
+        assert a_sf.size(0) == m * scale_k
+        assert b_sf.size(0) == l
+        assert b_sf.size(1) == n
+        assert b_sf.size(2) == scale_k
+        assert alpha.size(0) == l
+
+        num_tiles = m // self.tile_size
+        assert tile_idx_to_group_idx.dtype == torch.int32
+        assert tile_idx_to_group_idx.size() == (num_tiles, )
+        assert num_non_exiting_tiles.dtype == torch.int32
+        assert num_non_exiting_tiles.numel() == 1
+        assert global_sf.dtype == torch.float32
+        assert global_sf.numel() == 1
+
+        c = torch.empty(m, interm_size // 2, dtype=a.dtype, device=a.device)
+        c_sf = torch.empty(m * interm_size // self.scaling_vector_size,
+                           dtype=a_sf.dtype,
+                           device=a_sf.device)
+
+        a_ptr = make_ptr(cutlass.Float4E2M1FN,
+                         a.data_ptr(),
+                         cute.AddressSpace.gmem,
+                         assumed_align=32)
+        b_ptr = make_ptr(cutlass.Float4E2M1FN,
+                         b.data_ptr(),
+                         cute.AddressSpace.gmem,
+                         assumed_align=32)
+        a_sf_ptr = make_ptr(cutlass.Float8E4M3FN,
+                            a_sf.data_ptr(),
+                            cute.AddressSpace.gmem,
+                            assumed_align=16)
+        b_sf_ptr = make_ptr(cutlass.Float8E4M3FN,
+                            b_sf.data_ptr(),
+                            cute.AddressSpace.gmem,
+                            assumed_align=16)
+        alpha_ptr = make_ptr(cutlass.Float32, alpha.data_ptr(),
+                             cute.AddressSpace.gmem)
+        tile_idx_to_group_idx_ptr = make_ptr(
+            cutlass.Int32, tile_idx_to_group_idx.data_ptr(),
+            cute.AddressSpace.gmem)
+        num_non_exiting_tiles_ptr = make_ptr(
+            cutlass.Int32, num_non_exiting_tiles.data_ptr(),
+            cute.AddressSpace.gmem)
+        global_sf_ptr = make_ptr(cutlass.Float32, global_sf.data_ptr(),
+                                 cute.AddressSpace.gmem)
+        c_ptr = make_ptr(cutlass.Float4E2M1FN,
+                         c.data_ptr(),
+                         cute.AddressSpace.gmem,
+                         assumed_align=32)
+        c_sf_ptr = make_ptr(cutlass.Float8E4M3FN,
+                            c_sf.data_ptr(),
+                            cute.AddressSpace.gmem,
+                            assumed_align=16)
+
+        torch_stream = torch.cuda.current_stream()
+        stream = cuda.CUstream(torch_stream.cuda_stream)
+
+        if isinstance(tactic, tuple):
+            mma_tiler_mn, cluster_shape_mn = tactic
+        else:
+            mma_tiler_mn, cluster_shape_mn = (128, 128), (1, 1)
+
+        cache_key = (self.scaling_vector_size, self.tile_size, mma_tiler_mn,
+                     cluster_shape_mn)
+        if cache_key not in self.__class__.kernel_cache:
+            gemm = self.__class__.kernel_class(
+                sf_vec_size=self.scaling_vector_size,
+                acc_dtype=cutlass.Float32,
+                use_2cta_instrs=False,
+                mma_tiler_mn=mma_tiler_mn,
+                cluster_shape_mn=cluster_shape_mn,
+                vectorized_f32=True,
+            )
+            # Compute max active clusters on current device
+            hardware_info = cutlass.utils.HardwareInfo()
+            max_active_clusters = hardware_info.get_max_active_clusters(
+                cluster_shape_mn[0] * cluster_shape_mn[1])
+
+            compiled_gemm = cute.compile(
+                gemm.wrapper,
+                a_ptr,
+                b_ptr,
+                a_sf_ptr,
+                b_sf_ptr,
+                c_ptr,
+                c_sf_ptr,
+                alpha_ptr,
+                tile_idx_to_group_idx_ptr,
+                num_non_exiting_tiles_ptr,
+                global_sf_ptr,
+                m,
+                n,
+                k,
+                l,
+                tile_size=self.tile_size,
+                scaling_vector_size=self.scaling_vector_size,
+                max_active_clusters=max_active_clusters,
+                stream=stream,
+            )
+            self.__class__.kernel_cache[cache_key] = compiled_gemm
+        else:
+            compiled_gemm = self.__class__.kernel_cache[cache_key]
+
+        compiled_gemm(
+            a_ptr,
+            b_ptr,
+            a_sf_ptr,
+            b_sf_ptr,
+            c_ptr,
+            c_sf_ptr,
+            alpha_ptr,
+            tile_idx_to_group_idx_ptr,
+            num_non_exiting_tiles_ptr,
+            global_sf_ptr,
+            m,
+            n,
+            k,
+            l,
+            stream=stream,
+        )
+        return c, c_sf
+
+@torch.library.custom_op(
+    "trtllm::cute_dsl_nvfp4_grouped_gemm_swiglu_blackwell",
+    mutates_args=(),
+    device_types="cuda")
+def cute_dsl_nvfp4_grouped_gemm_swiglu_blackwell(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    input_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+    alpha: torch.Tensor,
+    tile_idx_to_group_idx: torch.Tensor,
+    num_non_exiting_tiles: torch.Tensor,
+    global_sf: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    num_local_experts: int,
+    local_expert_offset: int,
+    tile_size: int,
+    scaling_vector_size: int = 16,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    tuner = AutoTuner.get()
+
+    runner = Sm100BlockScaledContiguousGroupedGemmSwigluFusionRunner(
+        num_experts, top_k, num_local_experts, local_expert_offset,
+        tile_size, scaling_vector_size)
+    inputs = [
+        input, weight, input_scale, weight_scale, alpha,
+        tile_idx_to_group_idx, num_non_exiting_tiles, global_sf
+    ]
+
+    _, best_tactic = tuner.choose_one(
+        "trtllm::cute_dsl_nvfp4_grouped_gemm_swiglu_blackwell",
+        [runner],
+        runner.get_tuning_config(),
+        inputs,
+    )
+    output = runner(inputs, tactic=best_tactic)
+    return output
+
+@torch.library.register_fake(
+    "trtllm::cute_dsl_nvfp4_grouped_gemm_swiglu_blackwell")
+def _(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    input_scale: torch.Tensor,
+    weight_scale: torch.Tensor,
+    alpha: torch.Tensor,
+    tile_idx_to_group_idx: torch.Tensor,
+    num_non_exiting_tiles: torch.Tensor,
+    global_sf: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    num_local_experts: int,
+    local_expert_offset: int,
+    tile_size: int,
+    scaling_vector_size: int = 16,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    m = input.size(0)
+    n = weight.size(1)
+    interm_size = n // 2
+    output = torch.empty(m,
+                         interm_size // 2,
+                         dtype=input.dtype,
+                         device=input.device)
+    output_scale = torch.empty(m * interm_size // scaling_vector_size,
+                               dtype=input_scale.dtype,
+                               device=input_scale.device)
+    return output, output_scale
+
 class FusedMoEInputsHelper:
 
     def __init__(self, num_experts: int, top_k: int, num_local_experts: int,
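
Not part of the commit: a minimal call sketch for the new op, with dummy data whose shapes satisfy the assertions in the runner above (m a multiple of tile_size, k a multiple of 64, n a multiple of 128, n packing the gate and up projections). It assumes a Blackwell (SM 100) GPU, a PyTorch build that provides torch.float4_e2m1fn_x2, and that importing the module above registers the op under torch.ops.trtllm; the tensors are uninitialized, so only the shape/dtype contract is exercised, not numerical correctness.

# Hypothetical usage sketch (not from the commit); see assumptions above.
import torch
import tensorrt_llm._torch.custom_ops.cute_dsl_custom_ops  # noqa: F401, registers the op

device = "cuda"
m, k, n, l = 512, 512, 1024, 8      # n packs [gate, up], so interm_size = n // 2 = 512
tile_size, sv = 128, 16             # sv: scaling_vector_size (NVFP4 block size)
num_tiles = m // tile_size

a = torch.empty(m, k // 2, dtype=torch.float4_e2m1fn_x2, device=device)
b = torch.empty(l, n, k // 2, dtype=torch.float4_e2m1fn_x2, device=device)
a_sf = torch.empty(m * (k // sv), dtype=torch.uint8, device=device)
b_sf = torch.empty(l, n, k // sv, dtype=torch.uint8, device=device)
alpha = torch.ones(l, dtype=torch.float32, device=device)
tile_idx_to_group_idx = torch.zeros(num_tiles, dtype=torch.int32, device=device)
num_non_exiting_tiles = torch.tensor([num_tiles], dtype=torch.int32, device=device)
global_sf = torch.ones(1, dtype=torch.float32, device=device)

c, c_sf = torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_swiglu_blackwell(
    a, b, a_sf, b_sf, alpha, tile_idx_to_group_idx, num_non_exiting_tiles,
    global_sf, num_experts=l, top_k=2, num_local_experts=l,
    local_expert_offset=0, tile_size=tile_size, scaling_vector_size=sv)
print(c.shape, c_sf.shape)  # expected: (512, 256) packed NVFP4, (16384,) scale bytes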

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py

Lines changed: 2 additions & 3 deletions
@@ -52,6 +52,8 @@
 import cutlass.utils.blockscaled_layout as blockscaled_utils
 from cutlass.cute.nvgpu import cpasync, tcgen05
 
+from .utils import is_power_of_2
+
 
 class Sm100BlockScaledContiguousGroupedGemmKernel:
     """This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types
@@ -2052,9 +2054,6 @@ def is_valid_mma_tiler_and_cluster_shape(
             is_valid = False
 
         # Skip invalid cluster shape
-        def is_power_of_2(x: int) -> bool:
-            return x > 0 and (x & (x - 1)) == 0
-
         if (
             cluster_shape_mn[0] * cluster_shape_mn[1] > 16
             or cluster_shape_mn[0] <= 0

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py

Lines changed: 2 additions & 3 deletions
@@ -39,6 +39,8 @@
 from cutlass.cute.nvgpu import cpasync, tcgen05
 from cutlass.cutlass_dsl import T, dsl_user_op
 
+from .utils import is_power_of_2
+
 """
 High-performance persistent blockscaled contiguous grouped dense GEMM (C = alpha * (SFA * A) * (SFB * B)) example for
 the NVIDIA Blackwell architecture using CUTE DSL.
@@ -2060,9 +2062,6 @@ def is_valid_mma_tiler_and_cluster_shape(
             is_valid = False
 
         # Skip invalid cluster shape
-        def is_power_of_2(x: int) -> bool:
-            return x > 0 and (x & (x - 1)) == 0
-
         if (
             cluster_shape_mn[0] * cluster_shape_mn[1] > 16
             or cluster_shape_mn[0] <= 0
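
The diff for blackwell/utils.py is not among the files shown on this page. Based on the two identical inline definitions removed above and the new "from .utils import is_power_of_2" imports, the shared helper presumably looks like the following sketch (the docstring is mine):

def is_power_of_2(x: int) -> bool:
    """Return True if x is a positive power of two."""
    return x > 0 and (x & (x - 1)) == 0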
