Commit 81bb39d

fixed tests, readme, removed rms torch pattern
Signed-off-by: Eran Geva <[email protected]>
1 parent 9e0b47a commit 81bb39d

File tree

7 files changed: +26 / -44 lines

tensorrt_llm/_torch/auto_deploy/custom_ops/README.md

Lines changed: 12 additions & 4 deletions
@@ -17,13 +17,15 @@ The table below lists the operators ordered by their backend.
 | `torch.ops.auto_deploy.torch_attention` | Grouped SDPA implementation with `bsnd` and `bnsd` layout supported |
 | `torch.ops.auto_deploy.torch_attention_repeat_kv` | KV repetition for attention |
 | `torch.ops.auto_deploy.torch_attention_sdpa` | Standard SDPA implementation |
-| `torch.ops.auto_deploy.torch_dist_all_gather` | Distributed all-gather operation |
-| `torch.ops.auto_deploy.torch_dist_all_reduce` | Distributed all-reduce operation |
+| `torch.ops.auto_deploy.torch_dist_all_gather` | Distributed all-gather operation (PyTorch backend, demollm mode) |
+| `torch.ops.auto_deploy.torch_dist_all_reduce` | Distributed all-reduce operation (PyTorch backend, demollm mode) |
+| `torch.ops.auto_deploy.torch_fused_linear_all_reduce` | Fused linear layer followed by all-reduce (PyTorch backend, demollm mode) |
+| `torch.ops.auto_deploy.torch_fused_fp8_linear_all_reduce` | Fused FP8 linear layer followed by all-reduce (PyTorch backend, demollm mode) |
 | `torch.ops.auto_deploy.torch_linear_simple` | Simple linear layer implementation |
 | `torch.ops.auto_deploy.torch_moe` | Mixture of Experts implementation |
 | `torch.ops.auto_deploy.torch_moe_fused` | Fused Mixture of Experts implementation |
 | `torch.ops.auto_deploy.torch_quant_fn` | Generic quantization function that scales, rounds, and clamps input values |
-| `torch.ops.auto_deploy.torch_quant_fused_fp8_linear_all_reduce` | Fused FP8 linear layer followed by all-reduce operation |
+| `torch.ops.auto_deploy.torch_quant_fused_fp8_linear_all_reduce` | Legacy name for `torch_fused_fp8_linear_all_reduce` |
 | `torch.ops.auto_deploy.torch_quant_nvfp4_linear` | FP4 quantized linear layer |
 | `torch.ops.auto_deploy.torch_quant_fp8_linear` | FP8 quantized linear layer |
 | `torch.ops.auto_deploy.torch_rope_with_complex_freqs` | RoPE with complex frequencies |
@@ -38,4 +40,10 @@ The table below lists the operators ordered by their backend.
 | `torch.ops.auto_deploy.triton_rope_on_flattened_inputs` | Triton RoPE on flattened inputs |
 | `torch.ops.auto_deploy.triton_rope_with_input_pos` | Triton RoPE with input positions |
 | `torch.ops.auto_deploy.trtllm_moe_fused` | TensorRT LLM fused MoE implementation |
-| `torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce` | TensorRT LLM fused linear layer followed by all-reduce operation |
+| `torch.ops.auto_deploy.trtllm_dist_all_gather` | Distributed all-gather operation (TRT-LLM backend, MPI mode) |
+| `torch.ops.auto_deploy.trtllm_dist_all_reduce` | Distributed all-reduce operation (TRT-LLM backend, MPI mode) |
+| `torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce` | Legacy name for `trtllm_fused_linear_all_reduce` |
+| `torch.ops.auto_deploy.trtllm_fused_linear_all_reduce` | Fused linear layer followed by all-reduce (TRT-LLM backend, MPI mode) |
+| `torch.ops.auto_deploy.trtllm_fused_fp8_linear_all_reduce` | Fused FP8 linear layer followed by all-reduce (TRT-LLM backend, MPI mode) |
+| `torch.ops.dist.torch_fused_allreduce_residual_rmsnorm` | Fused all-reduce + residual add + RMSNorm (PyTorch backend, demollm mode) |
+| `torch.ops.dist.trtllm_fused_allreduce_residual_rmsnorm` | Fused all-reduce + residual add + RMSNorm (TRT-LLM backend, MPI mode) |
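For orientation, the fused `*_allreduce_residual_rmsnorm` entries collapse an all-reduce, a residual add, and an RMSNorm into a single op. Below is a minimal sketch of the unfused sequence they target; the exact argument order and eps handling of the fused op are assumptions inferred from this commit, not a documented signature:

```python
import torch

def unfused_allreduce_residual_rmsnorm(x, residual, weight, eps=1e-5):
    # The three-step sequence the fusion transform looks for in the graph.
    x = torch.ops.auto_deploy.trtllm_dist_all_reduce(x)  # all-reduce across ranks
    y = residual + x                                      # residual add
    normed = y * torch.rsqrt(y.pow(2).mean(-1, keepdim=True) + eps) * weight  # RMSNorm
    return normed, y

# Hypothetical call shape of the fused replacement (argument order assumed):
# normed, y = torch.ops.dist.trtllm_fused_allreduce_residual_rmsnorm(x, residual, weight, eps)
```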

tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py

Lines changed: 1 addition & 34 deletions
@@ -92,17 +92,6 @@ def replacement_fn(x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor
 # Instantiate Pattern and Replacement Functions
 # ============================================================================

-# Torch backend (demollm mode)
-_allreduce_residual_rmsnorm_pattern_torch = _make_allreduce_residual_rmsnorm_pattern(
-    torch.ops.auto_deploy.torch_dist_all_reduce, add_order="residual_first"
-)
-_allreduce_residual_rmsnorm_pattern2_torch = _make_allreduce_residual_rmsnorm_pattern(
-    torch.ops.auto_deploy.torch_dist_all_reduce, add_order="x_first"
-)
-_allreduce_residual_rmsnorm_repl_torch = _make_allreduce_residual_rmsnorm_replacement(
-    torch.ops.dist.torch_fused_allreduce_residual_rmsnorm
-)
-
 # TRT-LLM backend (MPI mode)
 _allreduce_residual_rmsnorm_pattern_trtllm = _make_allreduce_residual_rmsnorm_pattern(
     torch.ops.auto_deploy.trtllm_dist_all_reduce, add_order="residual_first"
@@ -149,29 +138,7 @@ def _apply(
     op_ignore_types = {torch.ops.aten.to.dtype: (torch.dtype,)}
     scalar_workaround = {"eps": 0.1253}

-    # Register BOTH torch and trtllm patterns
-    # The pattern matcher will find whichever is present in the graph
-
-    # Torch backend patterns (residual + x)
-    register_ad_pattern(
-        search_fn=_allreduce_residual_rmsnorm_pattern_torch,
-        replace_fn=_allreduce_residual_rmsnorm_repl_torch,
-        patterns=patterns,
-        dummy_args=dummy_args,
-        op_ignore_types=op_ignore_types,
-        scalar_workaround=scalar_workaround,
-    )
-
-    # Torch backend patterns (x + residual)
-    register_ad_pattern(
-        search_fn=_allreduce_residual_rmsnorm_pattern2_torch,
-        replace_fn=_allreduce_residual_rmsnorm_repl_torch,
-        patterns=patterns,
-        dummy_args=dummy_args,
-        op_ignore_types=op_ignore_types,
-        scalar_workaround=scalar_workaround,
-    )
-
+    # Register only trtllm patterns
     # TRT-LLM backend patterns (residual + x)
     register_ad_pattern(
         search_fn=_allreduce_residual_rmsnorm_pattern_trtllm,
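The two TRT-LLM patterns that remain registered differ only in `add_order` because the FX pattern matcher matches traced argument order literally: `residual + x` and `x + residual` produce `add` nodes with swapped arguments, so each ordering needs its own registered pattern. A small self-contained illustration (not code from this repo):

```python
import torch
import torch.fx

def add_residual_first(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
    return residual + x  # traced as call_function[operator.add](residual, x)

def add_x_first(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
    return x + residual  # traced as call_function[operator.add](x, residual)

# The traced graphs record different argument orders, so a single registered
# pattern would miss one of the two forms.
print(torch.fx.symbolic_trace(add_residual_first).graph)
print(torch.fx.symbolic_trace(add_x_first).graph)
```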

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 2 additions & 1 deletion
@@ -1004,7 +1004,8 @@ def detect_dp_bmm_shard(gm: GraphModule, sharding_config: ShardingConfig) -> Tra
     base_size = bmm_batch_size // world_size
     remainder = bmm_batch_size % world_size

-    # NOTE: our torch.ops.auto_deploy.torch_dist_all_gather doesn't support uneven splits at the moment.
+    # NOTE: our torch.ops.auto_deploy.torch_dist_all_gather/trtllm_dist_all_gather
+    # doesn't support uneven splits at the moment.
     if remainder:
         ad_logger.warning(
             f"BMM batch size {bmm_batch_size} is not divisible by world size {world_size}. "

tensorrt_llm/_torch/auto_deploy/transform/library/visualization.py

Lines changed: 1 addition & 0 deletions
@@ -77,6 +77,7 @@ def add_outputs_metadata(self, fx_node: torch.fx.node.Node, node: GraphNode):
 # TODO(yudong): make custom_ops configurable
 CUSTOM_OPS = (
     torch.ops.auto_deploy.torch_dist_all_reduce.default,
+    torch.ops.auto_deploy.trtllm_dist_all_reduce.default,
     torch.ops.aten.slice.Tensor,
     torch.ops.auto_deploy.triton_attention_fused_mha_with_cache.default,
     torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce.default,

tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py

Lines changed: 3 additions & 2 deletions
@@ -881,7 +881,8 @@ def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
         # Check if the distribution is balanced
         remainder = bmm_batch_size % self.world_size

-        # NOTE: our torch.ops.auto_deploy.torch_dist_all_gather doesn't support uneven splits at the moment.
+        # NOTE: our torch.ops.auto_deploy.torch_dist_all_gather/trtllm_dist_all_gather
+        # doesn't support uneven splits at the moment.
         if remainder:
             ad_logger.warning(
                 f"BMM batch size {bmm_batch_size} is not divisible by world size {self.world_size}. "
@@ -1070,7 +1071,7 @@ def _insert_sharded_mxfp4_mlp_ep(
     Transform a call to auto_deploy::triton_mxfp4_moe into:
       - sharded expert parameters along dim 0 (this rank's slice),
       - call to auto_deploy::triton_mxfp4_moe_ep(..., local_lo, local_hi),
-      - followed by torch_dist_all_reduce.
+      - followed by torch_dist_all_reduce/trtllm_dist_all_reduce.

    Expects the original op signature:
      (hidden_states,
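The `local_lo`/`local_hi` arguments in the docstring bound this rank's slice of the expert dimension. A hypothetical sketch of how such a contiguous expert range could be derived per rank (the actual partitioning in `_insert_sharded_mxfp4_mlp_ep` is not shown in this diff and may handle remainders differently):

```python
def local_expert_range(num_experts: int, world_size: int, rank: int) -> tuple[int, int]:
    # Even, contiguous split of experts across ranks
    # (assumes num_experts % world_size == 0).
    per_rank = num_experts // world_size
    local_lo = rank * per_rank
    local_hi = local_lo + per_rank
    return local_lo, local_hi

# Example: 32 experts over 4 ranks -> rank 1 owns experts [8, 16); each rank
# computes only its experts, then partial outputs are summed with
# torch_dist_all_reduce / trtllm_dist_all_reduce.
assert local_expert_range(32, 4, 1) == (8, 16)
```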

tests/unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_dist.py

Lines changed: 2 additions & 0 deletions
@@ -9,13 +9,15 @@

 def _run_all_reduce_test(rank, world_size):
     x = torch.ones(10, 10).to("cuda")
+    # Test torch backend (demollm mode with Python multiprocessing)
     y = torch.ops.auto_deploy.torch_dist_all_reduce(x)

     assert torch.equal(x * world_size, y)


 def _run_all_gather_test(rank, world_size):
     x = torch.ones(10, 10).to("cuda")
+    # Test torch backend (demollm mode with Python multiprocessing)
     y = torch.ops.auto_deploy.torch_dist_all_gather(x)

     assert torch.sum(y) == world_size * torch.sum(x)
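The new comments tie these helpers to the torch backend, which runs in demollm mode via Python multiprocessing. A rough, self-contained sketch of such a per-rank launch using plain `torch.distributed` (the repo's actual spawning/test harness is not shown in this diff):

```python
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def _worker(rank: int, world_size: int, port: int):
    # Hypothetical per-rank setup; only illustrates the multiprocessing-style
    # launch the comments above refer to.
    torch.cuda.set_device(rank)
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://127.0.0.1:{port}",
        rank=rank,
        world_size=world_size,
    )
    x = torch.ones(10, 10, device="cuda")
    dist.all_reduce(x)  # defaults to SUM, mirroring the all-reduce test above
    assert torch.equal(x, torch.full((10, 10), float(world_size), device="cuda"))
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    mp.spawn(_worker, args=(world_size, 29500), nprocs=world_size)
```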

tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py

Lines changed: 5 additions & 3 deletions
@@ -37,7 +37,8 @@ def __init__(self, hidden_size, dtype):
         self.norm = RMSNorm(hidden_size, 1e-5, dtype)

     def forward(self, x, residual):
-        x = torch.ops.auto_deploy.torch_dist_all_reduce.default(x)
+        # Use trtllm backend ops when running with MPI/TRT-LLM
+        x = torch.ops.auto_deploy.trtllm_dist_all_reduce.default(x)
         y = residual + x
         normed = self.norm(y)
         return normed, y
@@ -51,7 +52,8 @@ def __init__(self, hidden_size, dtype):
         self.norm = RMSNorm(hidden_size, 1e-5, dtype)

     def forward(self, x, residual):
-        x = torch.ops.auto_deploy.torch_dist_all_reduce.default(x)
+        # Use trtllm backend ops when running with MPI/TRT-LLM
+        x = torch.ops.auto_deploy.trtllm_dist_all_reduce.default(x)
         y = x + residual
         normed = self.norm(y)
         return normed, y
@@ -94,7 +96,7 @@ def _test_allreduce_fusion(port: int, ModuleCls):
     # Check if fused node in the graph
     has_fused_node = False
     for node in gm_transformed.graph.nodes:
-        if is_op(node, torch.ops.dist.fused_allreduce_residual_rmsnorm):
+        if is_op(node, torch.ops.dist.trtllm_fused_allreduce_residual_rmsnorm):
             has_fused_node = True
     assert has_fused_node, "Fused node not found."
