Commit aeb7410

added dist_backend arg
Signed-off-by: Eran Geva <[email protected]>
1 parent 572a89c commit aeb7410

File tree

4 files changed (+249, -14 lines changed)

tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ transforms:
     sharding_source: ['factory','heuristic']
     support_partial_config: true
     sharding_dims: ['tp', 'ep', 'bmm']
+    dist_backend: auto
     requires_shape_prop: true
   sharding_transform_executor:
     stage: sharding
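
For reference, the new key sits alongside the other detect_sharding options and accepts the same three values as the DistBackend enum introduced in sharding_utils.py below. A quick, illustrative way to check the parsed value (assumes PyYAML and a checkout-relative path; the transforms -> detect_sharding nesting is inferred from the hunk context above):

# Hypothetical sanity check, not part of the commit.
import yaml  # assumes PyYAML is installed

with open("tensorrt_llm/_torch/auto_deploy/config/default.yaml") as f:
    cfg = yaml.safe_load(f)

backend = cfg["transforms"]["detect_sharding"]["dist_backend"]  # key path inferred from the diff
assert backend in ("auto", "trtllm", "torch")
print(f"detect_sharding requests the {backend!r} distributed backend")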

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 16 additions & 0 deletions
@@ -39,6 +39,7 @@
 )
 from ...utils.sharding_utils import (
     BMMShardingInfo,
+    DistBackend,
     EPShardingInfo,
     LayerType,
     ParameterUpdateInfo,
@@ -134,6 +135,7 @@ def _process_simple_shard(
                 world_size=world_size,
                 dist_op="all_gather",
                 min_local_shape=1,
+                dist_backend=sharding_config.dist_backend,
             )
         )
     )
@@ -152,6 +154,7 @@ class ShardingTransformConfig(TransformConfig):
     sharding_dims: List[ShardingDim] = Field(
         default_factory=lambda: [ShardingDim.SSM, ShardingDim.TP, ShardingDim.EP, ShardingDim.BMM]
     )
+    dist_backend: DistBackend = Field(default=DistBackend.AUTO)
 
 
 @TransformRegistry.register("detect_sharding")
@@ -209,6 +212,7 @@ def _apply(
         sharding_config.support_partial_config = self.config.support_partial_config
         sharding_config.sharding_dims = self.config.sharding_dims
         sharding_config.sharding_source = self.config.sharding_source
+        sharding_config.dist_backend = self.config.dist_backend
 
         sharding_config.validate_config()
 
@@ -348,6 +352,7 @@ def _process_ssm_sharding(
                 dist_op=None,
                 min_local_shape=min_local_shape,
                 fused_weight_dims=fused_weight_dims["in_proj"],
+                dist_backend=sharding_config.dist_backend,
             )
         )
 
@@ -386,6 +391,7 @@ def _process_ssm_sharding(
                 dist_op=None,
                 min_local_shape=min_local_shape,
                 fused_weight_dims=fused_dims,
+                dist_backend=sharding_config.dist_backend,
             )
         )
 
@@ -422,6 +428,7 @@ def _process_ssm_sharding(
                 rank=rank,
                 world_size=world_size,
                 dist_op="all_reduce",
+                dist_backend=sharding_config.dist_backend,
             )
         )
     return 1
@@ -448,6 +455,7 @@ def _process_column_sharding(
                 world_size=world_size,
                 dist_op=None,  # for column sharding, no dist op is performed
                 min_local_shape=min_local_shape,
+                dist_backend=sharding_config.dist_backend,
             )
         )
 
@@ -581,6 +589,7 @@ def detect_sharding_from_factory_config(
                 world_size=world_size,
                 dist_op=None,
                 min_local_shape=min_local_shape,
+                dist_backend=sharding_config.dist_backend,
             )
         )
         num_row_col_shards += 1
@@ -593,6 +602,7 @@ def detect_sharding_from_factory_config(
                 world_size=world_size,
                 dist_op="all_reduce",
                 min_local_shape=min_local_shape,
+                dist_backend=sharding_config.dist_backend,
             )
         )
         num_row_col_shards += 1
@@ -606,6 +616,7 @@ def detect_sharding_from_factory_config(
                 dist_op=None,
                 min_local_shape=min_local_shape,
                 layer_type=LayerType.MAMBA,
+                dist_backend=sharding_config.dist_backend,
             )
         )
         num_row_col_shards += 1
@@ -626,6 +637,7 @@ def detect_sharding_from_factory_config(
                 world_size=world_size,
                 dist_op=None,
                 min_local_shape=min_local_shape,
+                dist_backend=sharding_config.dist_backend,
             )
         )
     elif col_row_action == "rowwise":
@@ -637,6 +649,7 @@ def detect_sharding_from_factory_config(
                 world_size=world_size,
                 dist_op="all_reduce",
                 min_local_shape=min_local_shape,
+                dist_backend=sharding_config.dist_backend,
             )
         )
         num_row_col_shards += 1
@@ -951,6 +964,7 @@ def detect_column_row_shard(
                 world_size=world_size,
                 dist_op="all_reduce",
                 min_local_shape=min_local_shape,
+                dist_backend=sharding_config.dist_backend,
             )
         )
 
@@ -1028,6 +1042,7 @@ def detect_dp_bmm_shard(gm: GraphModule, sharding_config: ShardingConfig) -> Tra
                 world_size=world_size,
                 start_idx=start_idx,
                 end_idx=end_idx,
+                dist_backend=sharding_config.dist_backend,
             )
         )
         ad_logger.debug(
@@ -1070,6 +1085,7 @@ def detect_ep_shard(gm: GraphModule, sharding_config: ShardingConfig) -> Transfo
                 node,
                 rank=rank,
                 world_size=world_size,
+                dist_backend=sharding_config.dist_backend,
             )
         )
         num_moe_patterns += 1
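
The hunks above are one mechanical change applied at every detection site: ShardingTransformConfig gains a dist_backend field, _apply copies it onto the shared ShardingConfig, and each helper forwards sharding_config.dist_backend into the transform info it records. A self-contained sketch of that flow, using hypothetical Mini* stand-ins rather than the real classes:

# Hypothetical stand-ins mirroring the propagation added in this file:
# per-transform config -> shared sharding config -> per-node transform info.
from enum import Enum
from typing import List

from pydantic import BaseModel, Field


class DistBackend(Enum):
    AUTO = "auto"
    TRTLLM = "trtllm"
    TORCH = "torch"


class MiniTransformConfig(BaseModel):
    dist_backend: DistBackend = Field(default=DistBackend.AUTO)


class MiniWeightShardingInfo(BaseModel):
    dist_backend: str = "auto"


class MiniShardingConfig(BaseModel):
    dist_backend: DistBackend = Field(default=DistBackend.AUTO)
    weight_sharding_transforms: List[MiniWeightShardingInfo] = Field(default_factory=list)


# _apply(): the YAML string is coerced to the enum and copied onto the shared config ...
config = MiniTransformConfig(dist_backend="trtllm")
sharding_config = MiniShardingConfig()
sharding_config.dist_backend = config.dist_backend

# ... and each detection helper forwards it into the transform info it appends.
sharding_config.weight_sharding_transforms.append(
    MiniWeightShardingInfo(dist_backend=sharding_config.dist_backend.value)
)
assert sharding_config.weight_sharding_transforms[0].dist_backend == "trtllm"

In the commit itself the enum is forwarded as-is (the transform infos declare dist_backend: str, and _get_dist_ops unwraps enum values), so the explicit .value above is only a convenience of this sketch.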

tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py

Lines changed: 59 additions & 14 deletions
@@ -29,25 +29,47 @@
 )
 
 
-def _get_dist_ops():
+def _get_dist_ops(backend: str):
     """Get the appropriate distributed ops based on backend availability.
 
+    Args:
+        backend: The distributed backend to use. Can be 'auto', 'trtllm', or 'torch'.
+            'auto' will automatically select based on availability.
+
     Returns tuple of (all_gather_op, all_reduce_op) for the current backend.
     """
-    from ..distributed.trtllm import is_trtllm_op_available
+    from ..custom_ops.trtllm_dist import is_trtllm_op_available
+
+    # Handle DistBackend enum or string
+    if hasattr(backend, "value"):
+        backend = backend.value
 
-    if is_trtllm_op_available():
-        # Use TRT-LLM optimized ops in MPI mode
+    if backend == "trtllm":
+        # Force TRT-LLM ops
         return (
             torch.ops.auto_deploy.trtllm_dist_all_gather.default,
             torch.ops.auto_deploy.trtllm_dist_all_reduce.default,
         )
-    else:
-        # Use PyTorch distributed ops in demollm mode
+    elif backend == "torch":
+        # Force PyTorch distributed ops
         return (
             torch.ops.auto_deploy.torch_dist_all_gather.default,
             torch.ops.auto_deploy.torch_dist_all_reduce.default,
         )
+    else:  # auto
+        # Automatically select based on availability
+        if is_trtllm_op_available():
+            # Use TRT-LLM optimized ops in MPI mode
+            return (
+                torch.ops.auto_deploy.trtllm_dist_all_gather.default,
+                torch.ops.auto_deploy.trtllm_dist_all_reduce.default,
+            )
+        else:
+            # Use PyTorch distributed ops in demollm mode
+            return (
+                torch.ops.auto_deploy.torch_dist_all_gather.default,
+                torch.ops.auto_deploy.torch_dist_all_reduce.default,
            )
 
 
 def _load_hook(
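
The selection rule introduced here: an explicit 'trtllm' or 'torch' is honored unconditionally, and only 'auto' consults is_trtllm_op_available(). A standalone sketch of that decision, where pick_backend and the trtllm_available flag are stand-ins and strings stand in for the torch.ops.auto_deploy.* handles the real helper returns:

def pick_backend(backend: str, trtllm_available: bool) -> str:
    """Illustrative stand-in for the branch structure of _get_dist_ops."""
    if backend == "trtllm":
        return "trtllm"  # forced, regardless of availability
    if backend == "torch":
        return "torch"  # forced PyTorch distributed ops
    # "auto": prefer the fused TRT-LLM ops when they are available (MPI mode),
    # otherwise fall back to the PyTorch distributed ops (demollm mode).
    return "trtllm" if trtllm_available else "torch"


assert pick_backend("auto", trtllm_available=True) == "trtllm"
assert pick_backend("auto", trtllm_available=False) == "torch"
assert pick_backend("torch", trtllm_available=True) == "torch"
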
@@ -251,6 +273,7 @@ def _insert_sharded_mamba(
     dim: int,
     rank: int,
     world_size: int,
+    dist_backend: str,
     add_dist: bool = False,
     min_local_shape: int = 1,
     weights_to_shard: Optional[list[str]] = None,
@@ -359,6 +382,7 @@ def _insert_sharded_mamba(
             dim=SplitDimension.COLUMN,
             rank=rank,
             world_size=world_size,
+            dist_backend=dist_backend,
             add_dist=False,
             min_local_shape=min_local_shape,
             fused_weight_dims=entry_fused_dims,
@@ -422,6 +446,7 @@ def _shard_parameter_node(
     dim: int,
     rank: int,
     world_size: int,
+    dist_backend: str,
    add_dist: bool = False,
     min_local_shape: int = 1,
     fused_weight_dims: Optional[list] = None,
@@ -507,7 +532,7 @@ def _shard_parameter_node(
         return
 
     # figure out the right dist op (backend-aware)
-    all_gather_op, all_reduce_op = _get_dist_ops()
+    all_gather_op, all_reduce_op = _get_dist_ops(dist_backend)
     dist_lookup = {
         0: (all_gather_op, -1),
         1: (all_reduce_op,),
@@ -595,6 +620,7 @@ class WeightShardingInfo(ShardingTransformInfo):
     layer_type: LayerType = LayerType.MLP
     # used for TP sharding of fused weights
     fused_weight_dims: Optional[list] = None
+    dist_backend: str = "auto"
 
     def quantization_cb(
         self,
@@ -644,6 +670,7 @@ def apply(self, gm: GraphModule, node: Node) -> None:
                 dim=self.split_dim.value,
                 rank=self.rank,
                 world_size=self.world_size,
+                dist_backend=self.dist_backend,
                 add_dist=self.dist_op is not None,
                 min_local_shape=self.min_local_shape,
                 fused_weight_dims=self.fused_weight_dims
@@ -658,6 +685,7 @@ def apply(self, gm: GraphModule, node: Node) -> None:
                 dim=self.split_dim.value,
                 rank=self.rank,
                 world_size=self.world_size,
+                dist_backend=self.dist_backend,
                 add_dist=self.dist_op is not None,
                 min_local_shape=self.min_local_shape,
                 fused_weight_dims=self.fused_weight_dims,
@@ -860,6 +888,7 @@ class BMMShardingInfo(ShardingTransformInfo):
     world_size: int
     start_idx: int
     end_idx: int
+    dist_backend: str = "auto"
 
     def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
         """Validate the transformation configuration."""
@@ -947,7 +976,7 @@ def slice_tensor(t: torch.Tensor) -> torch.Tensor:
             handle_tensor(node, rhs_tensor, 1, self.start_idx, self.end_idx)
 
         # Add all_gather node after BMM to collect results
-        all_gather_op, _ = _get_dist_ops()
+        all_gather_op, _ = _get_dist_ops(self.dist_backend)
         with gm.graph.inserting_after(node):
             gather_node = gm.graph.call_function(
                 all_gather_op,
@@ -962,6 +991,7 @@ def _insert_sharded_moe(
     node: Node,
     rank: int,
     world_size: int,
+    dist_backend: str,
     scale_names: Sequence[str] = (),
 ):
     """Update the torch_moe node with sharded weight lists,
@@ -1036,7 +1066,7 @@ def get_partition(lst, world_size, rank):
     node.args = tuple(args)
 
     # -- add an all_reduce node --
-    _, all_reduce_op = _get_dist_ops()
+    _, all_reduce_op = _get_dist_ops(dist_backend)
     with gm.graph.inserting_after(node):
         dist_node = gm.graph.call_function(all_reduce_op, args=(node,))
         node.replace_all_uses_with(dist_node)
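
Both MoE helpers use the same torch.fx surgery shown in this hunk: create the collective call right after the producer node, then reroute the producer's consumers to it. A self-contained toy version of the pattern (torch.relu stands in for the backend-specific all_reduce op; the replace_input_with call undoes the self-reference that replace_all_uses_with introduces in the new node's own args):

# Toy illustration of the graph-insertion pattern; not TensorRT-LLM code.
import torch
from torch.fx import symbolic_trace


def f(x):
    return x * 2


gm = symbolic_trace(f)
mul = next(n for n in gm.graph.nodes if n.op == "call_function")  # the x * 2 node

with gm.graph.inserting_after(mul):
    post = gm.graph.call_function(torch.relu, args=(mul,))  # stand-in for all_reduce

# Reroute every consumer of `mul` (including the graph output) to `post`, then
# restore `post`'s own input, which the blanket replacement also rewrote.
mul.replace_all_uses_with(post)
post.replace_input_with(post, mul)

gm.graph.lint()
gm.recompile()
x = torch.tensor([-1.0, 2.0])
assert torch.equal(gm(x), torch.relu(x * 2))
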
@@ -1066,6 +1096,7 @@ def _insert_sharded_mxfp4_mlp_ep(
     node: Node,
     rank: int,
     world_size: int,
+    dist_backend: str,
 ):
     """
     Transform a call to auto_deploy::triton_mxfp4_moe into:
@@ -1107,7 +1138,7 @@ def _insert_sharded_mxfp4_mlp_ep(
     node.args = args_ep
 
     # Add a dist all-reduce after the op (sum partial results across EP ranks)
-    _, all_reduce_op = _get_dist_ops()
+    _, all_reduce_op = _get_dist_ops(dist_backend)
     with gm.graph.inserting_after(node):
         red = gm.graph.call_function(all_reduce_op, args=(node,))
         node.replace_all_uses_with(red)
@@ -1120,6 +1151,7 @@ class EPShardingInfo(ShardingTransformInfo):
 
     rank: int
     world_size: int
+    dist_backend: str = "auto"
 
     @classmethod
     def from_node(cls, node: Node, **kwargs) -> "EPShardingInfo":
@@ -1138,7 +1170,7 @@ def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
 
     def apply(self, gm: GraphModule, node: Node) -> None:
         """Apply EP sharding transformation to the graph module."""
-        _insert_sharded_moe(gm, node, self.rank, self.world_size, [])
+        _insert_sharded_moe(gm, node, self.rank, self.world_size, self.dist_backend, [])
 
 
 class MXFP4EPShardingInfo(EPShardingInfo):
@@ -1152,7 +1184,7 @@ def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
         return True
 
     def apply(self, gm: GraphModule, node: Node) -> None:
-        _insert_sharded_mxfp4_mlp_ep(gm, node, self.rank, self.world_size)
+        _insert_sharded_mxfp4_mlp_ep(gm, node, self.rank, self.world_size, self.dist_backend)
 
 
 class FP8EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
@@ -1168,7 +1200,9 @@ def scale_names(self) -> List[str]:
         return ["input_scale", "weight_scale"]
 
     def apply(self, gm: GraphModule, node: Node) -> None:
-        _insert_sharded_moe(gm, node, self.rank, self.world_size, self.scale_names())
+        _insert_sharded_moe(
+            gm, node, self.rank, self.world_size, self.dist_backend, self.scale_names()
+        )
 
 
 class NVFP4EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
@@ -1184,7 +1218,9 @@ def scale_names(self) -> List[str]:
         return ["input_scale", "weight_scale", "alpha"]
 
     def apply(self, gm: GraphModule, node: Node) -> None:
-        _insert_sharded_moe(gm, node, self.rank, self.world_size, self.scale_names())
+        _insert_sharded_moe(
+            gm, node, self.rank, self.world_size, self.dist_backend, self.scale_names()
+        )
 
 
 EP_SHARDING_RULES = [
@@ -1222,6 +1258,14 @@ class ShardingDim(Enum):
     BMM = "bmm"
 
 
+class DistBackend(Enum):
+    """Enum for distributed backend."""
+
+    AUTO = "auto"
+    TRTLLM = "trtllm"
+    TORCH = "torch"
+
+
 class ShardingConfig(BaseModel):
     """Configuration for sharding the model."""
 
@@ -1237,6 +1281,7 @@ class ShardingConfig(BaseModel):
     sharding_dims: List[ShardingDim] = Field(
         default_factory=lambda: [ShardingDim.SSM, ShardingDim.TP, ShardingDim.EP, ShardingDim.BMM]
     )
+    dist_backend: DistBackend = Field(default=DistBackend.AUTO)
     weight_sharding_transforms: List[WeightShardingInfo] = Field(default_factory=list)
     parameter_update_transforms: List[ParameterUpdateInfo] = Field(default_factory=list)
     bmm_transforms: List[BMMShardingInfo] = Field(default_factory=list)
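
Note the dual typing these last hunks leave in place: ShardingConfig stores a DistBackend enum while the transform-info classes default to the plain string "auto", and both eventually reach _get_dist_ops, whose hasattr(backend, "value") unwrap accepts either form. A standalone illustration (normalize is a hypothetical helper, not part of the commit):

from enum import Enum


class DistBackend(Enum):
    AUTO = "auto"
    TRTLLM = "trtllm"
    TORCH = "torch"


def normalize(backend) -> str:
    """Mirror the enum-or-string handling at the top of _get_dist_ops."""
    if hasattr(backend, "value"):  # a DistBackend member (or any Enum)
        backend = backend.value
    return backend


assert normalize(DistBackend.TORCH) == "torch"  # enum path: ShardingConfig.dist_backend
assert normalize("torch") == "torch"  # string path: e.g. WeightShardingInfo.dist_backend default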
