
Commit 9e0b47a

split dist custom ops and use templated patterns
Signed-off-by: Eran Geva <[email protected]>
1 parent 46dd988 commit 9e0b47a
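
The diff below splits each dispatching distributed op into two atomic, backend-specific ops; picking between them is left to the caller (e.g. the graph-transform patterns referenced in the commit message). A minimal selection sketch, assuming the helper name pick_all_reduce_op is hypothetical - only the op names and is_trtllm_op_available() come from this commit:

import torch

from tensorrt_llm._torch.auto_deploy.distributed import trtllm as trtllm_dist


def pick_all_reduce_op(t: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: the backend check that used to live inside each
    # custom op now happens once, outside the op bodies.
    if trtllm_dist.is_trtllm_op_available():
        # MPI mode: TRT-LLM's optimized allreduce.
        return torch.ops.auto_deploy.trtllm_dist_all_reduce(t)
    # demollm mode: plain torch.distributed allreduce.
    return torch.ops.auto_deploy.torch_dist_all_reduce(t)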

File tree

7 files changed: +430, -125 lines

Lines changed: 61 additions & 13 deletions
@@ -1,4 +1,8 @@
-"""Custom ops required for implementing tensor parallelism."""
+"""Custom ops required for implementing tensor parallelism.
+
+This module defines atomic distributed ops - each op uses a specific backend
+(torch.distributed or TRT-LLM) without internal dispatch logic.
+"""
 
 from typing import List, Optional
 
@@ -7,38 +11,82 @@
 from ..distributed import common as dist
 from ..distributed import trtllm as trtllm_dist
 
+# ============================================================================
+# PyTorch Distributed Backend Ops (demollm mode)
+# ============================================================================
+
 
 @torch.library.custom_op("auto_deploy::torch_dist_all_gather", mutates_args=(), device_types="cuda")
-def all_gather(
+def torch_dist_all_gather(
     tensor: torch.Tensor, dim: int = 0, sizes: Optional[List[int]] = None
 ) -> torch.Tensor:
-    """All gather followed by concat in dim = 0. This is the default nccl behavior."""
-    if trtllm_dist.is_trtllm_op_available():
-        return trtllm_dist.trtllm_allgather(tensor, dim=dim, sizes=sizes)
+    """All gather using PyTorch distributed backend.
+
+    This op always uses torch.distributed.all_gather and is used in demollm mode.
+    """
     tl = [torch.zeros_like(tensor) for _ in range(dist.get_world_size())]
     dist.all_gather(tl, tensor)
     return torch.cat(tl, dim=dim)
 
 
-@all_gather.register_fake
-def all_gather_fake(tensor, dim=0):
+@torch_dist_all_gather.register_fake
+def torch_dist_all_gather_fake(tensor, dim=0, sizes=None):
     return torch.cat([torch.empty_like(tensor) for _ in range(dist.get_world_size())], dim=dim)
 
 
 @torch.library.custom_op("auto_deploy::torch_dist_all_reduce", mutates_args=(), device_types="cuda")
-def all_reduce(t: torch.Tensor) -> torch.Tensor:
-    """All_reduce across the ranks. Reduction op is SUM.
+def torch_dist_all_reduce(t: torch.Tensor) -> torch.Tensor:
+    """All_reduce using PyTorch distributed backend. Reduction op is SUM.
+
+    This op always uses torch.distributed.all_reduce and is used in demollm mode.
 
     NOTE: this op requires an extra memory copy and should ONLY be used for debugging + testing. For
     efficient all_reduce ops one should write/replace it with a fused op.
     """
-    if trtllm_dist.is_trtllm_op_available():
-        return trtllm_dist.trtllm_allreduce(t, op=dist.ReduceOp.SUM)
     t_res = t.clone()
     dist.all_reduce(t_res, op=dist.ReduceOp.SUM)
     return t_res
 
 
-@all_reduce.register_fake
-def all_reduce_fake(tensor):
+@torch_dist_all_reduce.register_fake
+def torch_dist_all_reduce_fake(tensor):
+    return torch.empty_like(tensor)
+
+
+# ============================================================================
+# TRT-LLM Backend Ops (MPI mode)
+# ============================================================================
+
+
+@torch.library.custom_op(
+    "auto_deploy::trtllm_dist_all_gather", mutates_args=(), device_types="cuda"
+)
+def trtllm_dist_all_gather(
+    tensor: torch.Tensor, dim: int = 0, sizes: Optional[List[int]] = None
+) -> torch.Tensor:
+    """All gather using TRT-LLM optimized backend.
+
+    This op always uses TRT-LLM's optimized allgather and is used in MPI mode.
+    """
+    return trtllm_dist.trtllm_allgather(tensor, dim=dim, sizes=sizes)
+
+
+@trtllm_dist_all_gather.register_fake
+def trtllm_dist_all_gather_fake(tensor, dim=0, sizes=None):
    return torch.cat([torch.empty_like(tensor) for _ in range(dist.get_world_size())], dim=dim)
+
+
+@torch.library.custom_op(
+    "auto_deploy::trtllm_dist_all_reduce", mutates_args=(), device_types="cuda"
+)
+def trtllm_dist_all_reduce(t: torch.Tensor) -> torch.Tensor:
+    """All_reduce using TRT-LLM optimized backend. Reduction op is SUM.
+
+    This op always uses TRT-LLM's optimized allreduce and is used in MPI mode.
+    """
+    return trtllm_dist.trtllm_allreduce(t, op=dist.ReduceOp.SUM)
+
+
+@trtllm_dist_all_reduce.register_fake
+def trtllm_dist_all_reduce_fake(tensor):
     return torch.empty_like(tensor)
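
For reference, the renamed atomic ops are callable through the torch.ops.auto_deploy namespace. A minimal usage sketch in demollm mode - the shapes and the assumption that the process group is already initialized are illustrative, not part of the diff:

import torch

# Assumes torch.distributed is already initialized and each rank holds a
# local shard of shape [seq, hidden] on its GPU.
local = torch.randn(4, 8, device="cuda")

# Gather shards from all ranks and concatenate along dim 0.
gathered = torch.ops.auto_deploy.torch_dist_all_gather(local, dim=0)

# Sum-reduce the tensor across all ranks.
reduced = torch.ops.auto_deploy.torch_dist_all_reduce(local)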

tensorrt_llm/_torch/auto_deploy/custom_ops/linear.py

Lines changed: 57 additions & 10 deletions
@@ -24,26 +24,73 @@ def simple(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tenso
 @simple.register_fake
 def simple_fake(input, weight, bias):
     """Fake implementation of simple_linear."""
-    # return torch.empty(
-    #     input.shape[:-1] + (weight.shape[-1],), dtype=input.dtype, device=input.device
-    # )
     return torch.ops.aten.linear(input, weight, bias)
 
 
+# ============================================================================
+# Fused Linear + AllReduce Ops (Atomic - Backend Specific)
+# ============================================================================
+
+
 @torch.library.custom_op(
-    "auto_deploy::trtllm_dist_fused_linear_all_reduce", mutates_args=(), device_types="cuda"
+    "auto_deploy::torch_fused_linear_all_reduce", mutates_args=(), device_types="cuda"
 )
-def fused_linear_all_reduce(
+def torch_fused_linear_all_reduce(
     input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor]
 ) -> torch.Tensor:
-    """Fused linear followed by all_reduce on the output."""
+    """Fused linear + all_reduce using PyTorch backend.
+
+    This op always uses torch.distributed and is used in demollm mode.
+    """
     output = torch.ops.aten.linear(input, weight, bias)
-    if trtllm_dist.is_trtllm_op_available():
-        return trtllm_dist.trtllm_allreduce(output, op=dist.ReduceOp.SUM)
     dist.all_reduce(output, op=dist.ReduceOp.SUM)
     return output
 
 
-@fused_linear_all_reduce.register_fake
-def fused_linear_all_reduce_fake(input, weight, bias):
+@torch_fused_linear_all_reduce.register_fake
+def torch_fused_linear_all_reduce_fake(input, weight, bias):
+    return torch.ops.aten.linear(input, weight, bias)
+
+
+@torch.library.custom_op(
+    "auto_deploy::trtllm_fused_linear_all_reduce", mutates_args=(), device_types="cuda"
+)
+def trtllm_fused_linear_all_reduce(
+    input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor]
+) -> torch.Tensor:
+    """Fused linear + all_reduce using TRT-LLM backend.
+
+    This op always uses TRT-LLM's optimized allreduce and is used in MPI mode.
+    """
+    output = torch.ops.aten.linear(input, weight, bias)
+    return trtllm_dist.trtllm_allreduce(output, op=dist.ReduceOp.SUM)
+
+
+@trtllm_fused_linear_all_reduce.register_fake
+def trtllm_fused_linear_all_reduce_fake(input, weight, bias):
+    return torch.ops.aten.linear(input, weight, bias)
+
+
+# ============================================================================
+# Legacy op name for backward compatibility
+# ============================================================================
+
+
+@torch.library.custom_op(
+    "auto_deploy::trtllm_dist_fused_linear_all_reduce", mutates_args=(), device_types="cuda"
+)
+def trtllm_dist_fused_linear_all_reduce(
+    input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor]
+) -> torch.Tensor:
+    """Legacy name for trtllm_fused_linear_all_reduce.
+
+    Kept for backward compatibility with existing code.
+    This is an alias that directly implements the same logic.
+    """
+    output = torch.ops.aten.linear(input, weight, bias)
+    return trtllm_dist.trtllm_allreduce(output, op=dist.ReduceOp.SUM)
+
+
+@trtllm_dist_fused_linear_all_reduce.register_fake
+def trtllm_dist_fused_linear_all_reduce_fake(input, weight, bias):
     return torch.ops.aten.linear(input, weight, bias)
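
The fused ops implement the usual row-parallel linear pattern: a local matmul on each rank's weight shard followed by a SUM all_reduce of the partial outputs. A minimal call sketch - the shapes and the omitted bias are illustrative assumptions:

import torch

inp = torch.randn(2, 16, device="cuda")      # local input shard [batch, in_features / world_size]
weight = torch.randn(32, 16, device="cuda")  # local weight shard [out_features, in_features / world_size]

# torch backend (demollm mode); the trtllm_* variant has the same signature.
out = torch.ops.auto_deploy.torch_fused_linear_all_reduce(inp, weight, None)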

tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py

Lines changed: 83 additions & 5 deletions
@@ -240,26 +240,104 @@ def fp8_linear_fake(
     return torch.ops.aten.linear(input, weight_fp8.to(input.dtype), bias)
 
 
+# ============================================================================
+# Fused FP8 Linear + AllReduce Ops (Atomic - Backend Specific)
+# ============================================================================
+
+
+@torch.library.custom_op("auto_deploy::torch_fused_fp8_linear_all_reduce", mutates_args=())
+@torch.compile(dynamic=True)
+def torch_fused_fp8_linear_all_reduce(
+    input: torch.Tensor,
+    weight_fp8: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    input_scale: Optional[torch.Tensor] = None,
+    weight_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Fused FP8 linear + all_reduce using PyTorch backend.
+
+    This op always uses torch.distributed and is used in demollm mode.
+    """
+    out = torch.ops.auto_deploy.torch_quant_fp8_linear(
+        input, weight_fp8, bias, input_scale, weight_scale
+    )
+    dist.all_reduce(out, op=dist.ReduceOp.SUM)
+    return out
+
+
+@torch_fused_fp8_linear_all_reduce.register_fake
+def torch_fused_fp8_linear_all_reduce_fake(
+    input: torch.Tensor,
+    weight_fp8: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    input_scale: Optional[torch.Tensor] = None,
+    weight_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return torch.ops.auto_deploy.torch_quant_fp8_linear(
+        input, weight_fp8, bias, input_scale, weight_scale
+    )
+
+
+@torch.library.custom_op("auto_deploy::trtllm_fused_fp8_linear_all_reduce", mutates_args=())
+@torch.compile(dynamic=True)
+def trtllm_fused_fp8_linear_all_reduce(
+    input: torch.Tensor,
+    weight_fp8: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    input_scale: Optional[torch.Tensor] = None,
+    weight_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Fused FP8 linear + all_reduce using TRT-LLM backend.
+
+    This op always uses TRT-LLM's optimized allreduce and is used in MPI mode.
+    """
+    out = torch.ops.auto_deploy.torch_quant_fp8_linear(
+        input, weight_fp8, bias, input_scale, weight_scale
+    )
+    return trtllm_dist.trtllm_allreduce(out, op=dist.ReduceOp.SUM)
+
+
+@trtllm_fused_fp8_linear_all_reduce.register_fake
+def trtllm_fused_fp8_linear_all_reduce_fake(
+    input: torch.Tensor,
+    weight_fp8: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    input_scale: Optional[torch.Tensor] = None,
+    weight_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return torch.ops.auto_deploy.torch_quant_fp8_linear(
+        input, weight_fp8, bias, input_scale, weight_scale
+    )
+
+
+# ============================================================================
+# Legacy op name for backward compatibility
+# ============================================================================
+
+
 @torch.library.custom_op("auto_deploy::torch_quant_fused_fp8_linear_all_reduce", mutates_args=())
 @torch.compile(dynamic=True)
-def fused_fp8_linear_all_reduce(
+def torch_quant_fused_fp8_linear_all_reduce(
     input: torch.Tensor,
     weight_fp8: torch.Tensor,
     bias: Optional[torch.Tensor] = None,
     input_scale: Optional[torch.Tensor] = None,
     weight_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
+    """Legacy name for torch_fused_fp8_linear_all_reduce.
+
+    Kept for backward compatibility with existing code.
+    Defaults to torch backend (demollm mode).
+    """
     out = torch.ops.auto_deploy.torch_quant_fp8_linear(
         input, weight_fp8, bias, input_scale, weight_scale
     )
-    if trtllm_dist.is_trtllm_op_available():
-        return trtllm_dist.trtllm_allreduce(out, op=dist.ReduceOp.SUM)
     dist.all_reduce(out, op=dist.ReduceOp.SUM)
     return out
 
 
-@fused_fp8_linear_all_reduce.register_fake
-def fused_fp8_linear_all_reduce_fake(
+@torch_quant_fused_fp8_linear_all_reduce.register_fake
+def torch_quant_fused_fp8_linear_all_reduce_fake(
     input: torch.Tensor,
     weight_fp8: torch.Tensor,
     bias: Optional[torch.Tensor] = None,
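
Likewise for the FP8 variants: the op name and argument order below come from the diff, while the dtypes and per-tensor scale values are assumptions about what torch_quant_fp8_linear expects.

import torch

inp = torch.randn(2, 16, device="cuda", dtype=torch.half)
weight_fp8 = torch.randn(32, 16, device="cuda").to(torch.float8_e4m3fn)
input_scale = torch.tensor(1.0, device="cuda")   # assumed per-tensor scale
weight_scale = torch.tensor(1.0, device="cuda")  # assumed per-tensor scale

# torch backend (demollm mode); the trtllm_* variant has the same signature.
out = torch.ops.auto_deploy.torch_fused_fp8_linear_all_reduce(
    inp, weight_fp8, None, input_scale, weight_scale
)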
