
Commit 8914503

TRT-LLM installation tool
1 parent 1f4c159 commit 8914503

11 files changed: +385 -164 lines


.github/workflows/build-test-linux-aarch64.yml

Lines changed: 35 additions & 0 deletions
@@ -356,6 +356,41 @@ jobs:
           python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
           popd
 
+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    if: false
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 33 additions & 0 deletions
@@ -340,6 +340,39 @@ jobs:
           python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
           popd
 
+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true
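
Both workflows add the same tests-py-distributed job (it is gated off with "if: false" on aarch64): install MPI, export USE_TRTLLM_PLUGINS=1, and run the NCCL ops tests under tests/py/dynamo. A rough local equivalent of that CI step, assuming MPI and the TensorRT-LLM plugin are already available on the machine, might look like the illustrative helper below (not part of this commit):

# local_distributed_tests.py -- hypothetical helper mirroring the CI job.
import os
import subprocess
import sys

env = dict(os.environ)
env.update({
    "USE_HOST_DEPS": "1",
    "CI_BUILD": "1",
    "USE_TRTLLM_PLUGINS": "1",  # same flags the workflow exports
})

# Run the distributed NCCL ops tests from the repository root.
subprocess.run(
    [sys.executable, "-m", "pytest", "-ra", "distributed/test_nccl_ops.py"],
    cwd="tests/py/dynamo",
    env=env,
    check=True,
)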

dev_dep_versions.yml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 __cuda_version__: "12.8"
 __tensorrt_version__: "10.12.0"
 __tensorrt_rtx_version__: "1.0.0"
+__tensorrt_llm_version__: "0.17.0.post1"

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 0 deletions
@@ -103,6 +103,7 @@ def cross_compile_for_windows(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -176,6 +177,7 @@ def cross_compile_for_windows(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -330,6 +332,7 @@ def cross_compile_for_windows(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -430,6 +433,7 @@ def compile(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -506,6 +510,7 @@ def compile(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -674,6 +679,7 @@ def compile(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
 
     settings = CompilationSettings(**compilation_options)
@@ -1045,6 +1051,7 @@ def convert_exported_program_to_serialized_trt_engine(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1118,6 +1125,7 @@ def convert_exported_program_to_serialized_trt_engine(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
@@ -1286,6 +1294,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
 
     settings = CompilationSettings(**compilation_options)
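
All three entry points (cross_compile_for_windows, compile, and convert_exported_program_to_serialized_trt_engine) now forward use_distributed_mode_trace into CompilationSettings. A minimal sketch of opting in from user code follows; the toy module and shapes are purely illustrative and not part of this commit:

import torch
import torch_tensorrt

# Toy module standing in for a distributed / tensor-parallel model.
class ToyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x @ x)

model = ToyModel().eval().cuda()
inputs = (torch.randn(8, 8, device="cuda"),)

# Export, then compile with the new flag so the graph is traced via
# aot_autograd, the path intended for models containing DTensors.
exported = torch.export.export(model, inputs)
trt_module = torch_tensorrt.dynamo.compile(
    exported,
    inputs=list(inputs),
    use_distributed_mode_trace=True,  # option added in this commit
)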

py/torch_tensorrt/dynamo/conversion/converter_utils.py

Lines changed: 0 additions & 65 deletions
@@ -1,8 +1,6 @@
 import collections
-import ctypes
 import functools
 import logging
-import os
 from typing import (
     Any,
     Callable,
@@ -1124,69 +1122,6 @@ def args_bounds_check(
     return args[i] if len(args) > i and args[i] is not None else replacement
 
 
-def load_tensorrt_llm() -> bool:
-    """
-    Attempts to load the TensorRT-LLM plugin and initialize it.
-
-    Returns:
-        bool: True if the plugin was successfully loaded and initialized, False otherwise.
-    """
-    try:
-        import tensorrt_llm as trt_llm  # noqa: F401
-
-        _LOGGER.info("TensorRT-LLM successfully imported")
-        return True
-    except (ImportError, AssertionError) as e_import_error:
-        # Check for environment variable for the plugin library path
-        plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
-        if not plugin_lib_path:
-            _LOGGER.warning(
-                "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops",
-            )
-            return False
-
-        _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}")
-        try:
-            # Load the shared library
-            handle = ctypes.CDLL(plugin_lib_path)
-            _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}")
-        except OSError as e_os_error:
-            _LOGGER.error(
-                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}"
-                f"Ensure the path is correct and the library is compatible",
-                exc_info=e_os_error,
-            )
-            return False
-
-        try:
-            # Configure plugin initialization arguments
-            handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-            handle.initTrtLlmPlugins.restype = ctypes.c_bool
-        except AttributeError as e_plugin_unavailable:
-            _LOGGER.warning(
-                "Unable to initialize the TensorRT-LLM plugin library",
-                exc_info=e_plugin_unavailable,
-            )
-            return False
-
-        try:
-            # Initialize the plugin
-            TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm"
-            if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")):
-                _LOGGER.info("TensorRT-LLM plugin successfully initialized")
-                return True
-            else:
-                _LOGGER.warning("TensorRT-LLM plugin library failed in initialization")
-                return False
-        except Exception as e_initialization_error:
-            _LOGGER.warning(
-                "Exception occurred during TensorRT-LLM plugin library initialization",
-                exc_info=e_initialization_error,
-            )
-            return False
-    return False
-
-
 def promote_trt_tensors_to_same_dtype(
     ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str
 ) -> tuple[TRTTensor, TRTTensor]:

py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py

Lines changed: 6 additions & 6 deletions
@@ -11,15 +11,15 @@
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     dynamo_tensorrt_converter,
 )
-from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm
+from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
+    tensorrt_fused_nccl_all_gather_op,
+    tensorrt_fused_nccl_reduce_scatter_op,
+)
+from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
-if load_tensorrt_llm():
-    from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
-        tensorrt_fused_nccl_all_gather_op,
-        tensorrt_fused_nccl_reduce_scatter_op,
-    )
+if load_tensorrt_llm_for_nccl():
 
     @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op)
     def fused_nccl_gather(
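
With this change, the fused NCCL all_gather and reduce_scatter converters are registered only when load_tensorrt_llm_for_nccl() (now in torch_tensorrt.dynamo.utils) reports that the TensorRT-LLM plugin library is usable. A small sketch of checking that gate from user code is shown below; the TRTLLM_PLUGINS_PATH fallback and the library path are assumptions carried over from the removed helper, and the new utility's exact behavior is not shown in this diff:

import os

# Assumption: like the removed load_tensorrt_llm() helper, the new utility can
# fall back to a user-provided plugin library pointed to by TRTLLM_PLUGINS_PATH;
# the path below is purely illustrative.
os.environ.setdefault(
    "TRTLLM_PLUGINS_PATH", "/opt/trtllm/libnvinfer_plugin_tensorrt_llm.so"
)

from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl  # noqa: E402

# The fused NCCL converters in custom_ops_converters.py are only registered
# when this returns True.
print("NCCL converters available:", load_tensorrt_llm_for_nccl())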
