@@ -174,7 +174,7 @@ def vectorized_atomic_add_fp32x2(rOut_epi_packed, scatter_out_offset, loc=None,
             rOut_epi_packed[0].ir_value(),
             rOut_epi_packed[1].ir_value(),
         ],
-        "red.global.v2.f32.add [$0], {$1, $1};",
+        "red.global.v2.f32.add [$0], {$1, $2};",
        "l,f,f",
        has_side_effects=True,
    )
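
Note on the hunk above: the constraint string "l,f,f" binds $0 to the 64-bit destination address and $1/$2 to the two packed floats, but the old PTX template wrote {$1, $1}, so both lanes of the vectorized reduction accumulated the first value and the second was dropped. A minimal pure-Python illustration of the semantic difference (numpy stands in for the device-side reduction; this is not the DSL code itself):

    import numpy as np

    out = np.zeros(2, dtype=np.float32)
    packed = np.array([1.5, 2.5], dtype=np.float32)

    # old template "{$1, $1}": both lanes accumulate packed[0]
    buggy = out + np.array([packed[0], packed[0]])   # -> [1.5, 1.5]
    # new template "{$1, $2}": each lane accumulates its own value
    fixed = out + np.array([packed[0], packed[1]])   # -> [1.5, 2.5]
    print(buggy, fixed)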
@@ -190,7 +190,7 @@ def atomic_add_func(rOut_epi_packed, scatter_out_offset, loc=None, ip=None):
            rOut_epi_packed.ir_value(),
        ],
        "red.global.add.f32 [$0], $1;",
-       "=l,f",
+       "l,f",
        has_side_effects=True,
        loc=loc,
        ip=ip,
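
In LLVM inline assembly, a leading "=" marks an output operand, and the $n placeholders number outputs first, then inputs. red.global.add.f32 is a fire-and-forget reduction with no result, so "=l,f" mis-bound the operands ($0 became a phantom output register instead of the address); "l,f" correctly binds $0 to the address and $1 to the value. A tiny hypothetical helper showing the mapping:

    # Hypothetical helper: LLVM inline-asm placeholders number outputs
    # (prefixed "=") before inputs.
    def placeholder_map(constraints: str) -> dict:
        parts = [c.strip() for c in constraints.split(",")]
        outs = [c for c in parts if c.startswith("=")]
        ins = [c for c in parts if not c.startswith("=")]
        return {f"${i}": c for i, c in enumerate(outs + ins)}

    print(placeholder_map("=l,f"))  # {'$0': '=l', '$1': 'f'} -- $0 is a bogus output
    print(placeholder_map("l,f"))   # {'$0': 'l', '$1': 'f'}  -- address, then value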
@@ -498,6 +498,7 @@ def __call__(
        gemm_output_major: cutlass.Constexpr,
        tile_idx_to_expert_idx: cute.Tensor,
        num_non_exiting_tiles: cute.Tensor,
+       tile_idx_to_mn_limit: cute.Tensor,
        alpha: cute.Tensor,
        max_active_clusters: cutlass.Constexpr,
        stream: cuda.CUstream,
@@ -739,6 +740,7 @@ class SharedStorage:
            out,
            tile_idx_to_expert_idx,
            num_non_exiting_tiles,
+           tile_idx_to_mn_limit,
            alpha,
            permuted_idx_to_expanded_idx,
            token_final_scales,
@@ -821,6 +823,7 @@ def kernel(
        out: cute.Tensor,
        tile_idx_to_expert_idx: cute.Tensor,
        num_non_exiting_tiles: cute.Tensor,
+       tile_idx_to_mn_limit: cute.Tensor,
        alpha: cute.Tensor,
        permuted_idx_to_expanded_idx: cute.Tensor,
        token_final_scales: cute.Tensor,
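
The three hunks above thread the new tile_idx_to_mn_limit tensor from the public __call__ signature through the host-side launch into the device kernel. It holds one entry per scheduled tile: the exclusive upper bound on valid (non-padding) rows for that tile. A hedged host-side sketch of how such a table could be built for a grouped/MoE GEMM (the 128-row CTA tile and the per-expert token counts are made-up example inputs, not taken from this diff):

    import numpy as np

    cta_tile_m = 128                      # assumed CTA tile height
    expert_token_counts = [300, 17, 128]  # example per-expert token counts

    limits, row_start = [], 0
    for count in expert_token_counts:
        padded = -(-count // cta_tile_m) * cta_tile_m  # round up to the tile grid
        # every tile of this expert shares the expert's valid-row upper bound
        limits += [row_start + count] * (padded // cta_tile_m)
        row_start += padded
    tile_idx_to_mn_limit = np.asarray(limits, dtype=np.int32)
    print(tile_idx_to_mn_limit)  # [300 300 300 401 640]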
@@ -1612,7 +1615,9 @@ def kernel(
        token_scale = self.final_scale_dtype(0.0)
        topK = token_final_scales.shape[1]

-       if expanded_idx >= 0:
+       tile_mn_limit = tile_idx_to_mn_limit[mma_tile_coord_mnl[0]]
+
+       if permuted_row < tile_mn_limit:
            token_idx = expanded_idx // topK
            topk_idx = expanded_idx % topK
            token_scale = token_final_scales[(token_idx, topk_idx)]
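
The guard changes from the per-row sentinel test expanded_idx >= 0 to a per-tile bound: the kernel reads the tile's limit once (indexed by the tile's M coordinate, mma_tile_coord_mnl[0]) and compares the row index against it, so padding rows keep token_scale at 0.0. An illustrative Python sketch of the masking (all values made up):

    topK = 4
    tile_mn_limit = 300  # looked up from tile_idx_to_mn_limit for this tile

    for permuted_row, expanded_idx in [(298, 1193), (299, 41), (300, -1)]:
        if permuted_row < tile_mn_limit:
            token_idx, topk_idx = divmod(expanded_idx, topK)  # row is live
            print(permuted_row, "->", (token_idx, topk_idx))
        else:
            print(permuted_row, "-> masked padding row, scale stays 0.0")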
@@ -1652,24 +1657,25 @@ def kernel(

        rOut_epi.store(acc_vec_finalized.to(self.out_dtype))

-       coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[
-           1
-       ] + subtile_idx * cute.size(tTR_rAcc)
-
-       for index in cutlass.range(loop_size):
-           scatter_out_offset = cute.domain_offset((0, coord_n, 0), scatter_out)
-           if cutlass.const_expr(self.out_dtype == cutlass.BFloat16):
-               rOut_epi_packed = rOut_epi[index, None, None]
-               vectorized_atomic_add_bf16x8(rOut_epi_packed, scatter_out_offset)
-               coord_n += cute.size(rOut_epi_packed)
-           elif cutlass.const_expr(self.out_dtype == cutlass.Float32):
-               rOut_epi_packed = rOut_epi[index, None]
-               vectorized_atomic_add_fp32x2(rOut_epi_packed, scatter_out_offset)
-               coord_n += cute.size(rOut_epi_packed)
-           else:
-               rOut_epi_packed = rOut_epi[index]
-               atomic_add_func(rOut_epi_packed, scatter_out_offset)
-               coord_n += 1
+       if permuted_row < tile_mn_limit:
+           coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[
+               1
+           ] + subtile_idx * cute.size(tTR_rAcc)
+
+           for index in cutlass.range(loop_size):
+               scatter_out_offset = cute.domain_offset((0, coord_n, 0), scatter_out)
+               if cutlass.const_expr(self.out_dtype == cutlass.BFloat16):
+                   rOut_epi_packed = rOut_epi[index, None, None]
+                   vectorized_atomic_add_bf16x8(rOut_epi_packed, scatter_out_offset)
+                   coord_n += cute.size(rOut_epi_packed)
+               elif cutlass.const_expr(self.out_dtype == cutlass.Float32):
+                   rOut_epi_packed = rOut_epi[index, None]
+                   vectorized_atomic_add_fp32x2(rOut_epi_packed, scatter_out_offset)
+                   coord_n += cute.size(rOut_epi_packed)
+               else:
+                   rOut_epi_packed = rOut_epi[index]
+                   atomic_add_func(rOut_epi_packed, scatter_out_offset)
+                   coord_n += 1
        self.epilog_sync_barrier.arrive_and_wait()
        #
        # Async arrive accumulator buffer empty
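
The hunk above places the entire scatter atomic-add loop under the same permuted_row < tile_mn_limit guard, so padding rows issue no red.global traffic at all; note that self.epilog_sync_barrier.arrive_and_wait() deliberately stays outside the guard so every thread still arrives at the barrier. A plain-Python sketch of the vector-width dispatch (the per-call element counts mirror the three branches above; loop_size and the dtype strings are assumptions for illustration):

    def red_width(out_dtype: str) -> int:
        # elements consumed per reduction call, as in the branches above
        if out_dtype == "bf16":
            return 8   # vectorized_atomic_add_bf16x8
        if out_dtype == "fp32":
            return 2   # vectorized_atomic_add_fp32x2
        return 1       # scalar atomic_add_func fallback

    coord_n = 0
    for _ in range(4):            # loop_size = 4, made up
        coord_n += red_width("fp32")
    print(coord_n)                # 8 -- column offset advances by 2 per call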
@@ -1697,10 +1703,6 @@ def kernel(
        tmem.relinquish_alloc_permit()
        self.epilog_sync_barrier.arrive_and_wait()
        tmem.free(tmem_ptr)
-       #
-       # Wait for C store complete
-       #
-       # c_pipeline.producer_tail()

    def epilog_tmem_copy_and_partition(
        self,
@@ -1858,9 +1860,6 @@ def _compute_stages(
        # Start with total smem per CTA (capacity / occupancy)
        # Subtract reserved bytes and initial C stages bytes
        # Divide remaining by bytes needed per A/B stage
-       # cute.printf("num_smem_capacity: {}, occupancy: {}, mbar_helpers_bytes: {}, c_bytes: {}", num_smem_capacity,
-       #     occupancy, mbar_helpers_bytes, c_bytes)
-       # cute.printf("ab_bytes_per_stage: {}", ab_bytes_per_stage)
        num_ab_stage = (num_smem_capacity // occupancy - (mbar_helpers_bytes)) // ab_bytes_per_stage

        # Refine epilogue stages:
@@ -2282,9 +2281,9 @@ def wrapper(
        tile_idx_to_group_idx = cute.make_tensor(
            tile_idx_to_group_idx_ptr, layout=cute.make_layout((num_tiles,))
        )
-       # tile_idx_to_mn_limit = cute.make_tensor(
-       #     tile_idx_to_mn_limit_ptr, layout=cute.make_layout((num_tiles,))
-       # )
+       tile_idx_to_mn_limit = cute.make_tensor(
+           tile_idx_to_mn_limit_ptr, layout=cute.make_layout((num_tiles,))
+       )
        permuted_idx_to_expanded_idx = cute.make_tensor(
            permuted_idx_to_expanded_idx_ptr, layout=cute.make_layout((m,))
        )
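
This hunk enables the previously commented-out wrapper: tile_idx_to_mn_limit gets the same flat (num_tiles,) layout as tile_idx_to_group_idx. A hedged sketch of how the backing pointer might be produced upstream, assuming a torch int32 device buffer and the CuTe DSL's make_ptr helper (the helper and its argument names are an assumption, not confirmed by this diff):

    import torch
    import cutlass
    import cutlass.cute as cute
    from cutlass.cute.runtime import make_ptr  # assumed import path

    num_tiles = 64
    mn_limit_buf = torch.zeros(num_tiles, dtype=torch.int32, device="cuda")
    # wrap the raw device address so cute.make_tensor can attach a layout
    tile_idx_to_mn_limit_ptr = make_ptr(
        cutlass.Int32, mn_limit_buf.data_ptr(), cute.AddressSpace.gmem
    )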
@@ -2305,6 +2304,7 @@ def wrapper(
            "n",
            tile_idx_to_group_idx,
            num_non_exiting_tiles,
+           tile_idx_to_mn_limit,
            alpha,
            max_active_clusters=max_active_clusters,
            stream=stream,