
Commit 8914503

TRT-LLM installation tool
1 parent 1f4c159 commit 8914503

11 files changed: +385 -164 lines


.github/workflows/build-test-linux-aarch64.yml

Lines changed: 35 additions & 0 deletions
@@ -356,6 +356,41 @@ jobs:
           python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
           popd
 
+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    if: false
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 33 additions & 0 deletions
@@ -340,6 +340,39 @@ jobs:
           python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_core_test_results.xml .
           popd
 
+  tests-py-distributed:
+    name: Test dynamo distributed [Python]
+    needs: [filter-matrix, build]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/tensorrt
+            package-name: torch_tensorrt
+            pre-script: packaging/pre_build_script.sh
+            post-script: packaging/post_build_script.sh
+            smoke-test-script: packaging/smoke_test_script.sh
+    uses: ./.github/workflows/linux-test.yml
+    with:
+      job-name: tests-py-dynamo-distributed
+      repository: "pytorch/tensorrt"
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      script: |
+        set -euo pipefail
+        export USE_HOST_DEPS=1
+        export CI_BUILD=1
+        export USE_TRTLLM_PLUGINS=1
+        dnf install -y mpich mpich-devel openmpi openmpi-devel
+        pushd .
+        cd tests/py
+        cd dynamo
+        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
+        popd
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
   cancel-in-progress: true
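
Both workflows add the same tests-py-distributed job (it is gated off with "if: false" on aarch64): install MPI, export USE_TRTLLM_PLUGINS=1, and run the NCCL ops tests under tests/py/dynamo. A rough local equivalent of that CI step, assuming MPI and the TensorRT-LLM plugin are already available on the machine, might look like the illustrative helper below (not part of this commit):

# local_distributed_tests.py -- hypothetical helper mirroring the CI job.
import os
import subprocess
import sys

env = dict(os.environ)
env.update({
    "USE_HOST_DEPS": "1",
    "CI_BUILD": "1",
    "USE_TRTLLM_PLUGINS": "1",  # same flags the workflow exports
})

# Run the distributed NCCL ops tests from the repository root.
subprocess.run(
    [sys.executable, "-m", "pytest", "-ra", "distributed/test_nccl_ops.py"],
    cwd="tests/py/dynamo",
    env=env,
    check=True,
)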

dev_dep_versions.yml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 __cuda_version__: "12.8"
 __tensorrt_version__: "10.12.0"
 __tensorrt_rtx_version__: "1.0.0"
+__tensorrt_llm_version__: "0.17.0.post1"

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 0 deletions
@@ -103,6 +103,7 @@ def cross_compile_for_windows(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -176,6 +177,7 @@ def cross_compile_for_windows(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -330,6 +332,7 @@ def cross_compile_for_windows(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -430,6 +433,7 @@ def compile(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -506,6 +510,7 @@ def compile(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -674,6 +679,7 @@ def compile(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
 
     settings = CompilationSettings(**compilation_options)
@@ -1045,6 +1051,7 @@ def convert_exported_program_to_serialized_trt_engine(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1118,6 +1125,7 @@ def convert_exported_program_to_serialized_trt_engine(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        use_distributed_mode_trace (bool): Use aot_autograd to trace the graph. Enable this when DTensors or distributed tensors are present in the model.
         **kwargs: Any,
     Returns:
         bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
@@ -1286,6 +1294,7 @@ def convert_exported_program_to_serialized_trt_engine(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "use_distributed_mode_trace": use_distributed_mode_trace,
     }
 
     settings = CompilationSettings(**compilation_options)
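
All three entry points (cross_compile_for_windows, compile, and convert_exported_program_to_serialized_trt_engine) now forward use_distributed_mode_trace into CompilationSettings. A minimal sketch of opting in from user code follows; the toy module and shapes are purely illustrative and not part of this commit:

import torch
import torch_tensorrt

# Toy module standing in for a distributed / tensor-parallel model.
class ToyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x @ x)

model = ToyModel().eval().cuda()
inputs = (torch.randn(8, 8, device="cuda"),)

# Export, then compile with the new flag so the graph is traced via
# aot_autograd, the path intended for models containing DTensors.
exported = torch.export.export(model, inputs)
trt_module = torch_tensorrt.dynamo.compile(
    exported,
    inputs=list(inputs),
    use_distributed_mode_trace=True,  # option added in this commit
)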

py/torch_tensorrt/dynamo/conversion/converter_utils.py

Lines changed: 0 additions & 65 deletions
@@ -1,8 +1,6 @@
 import collections
-import ctypes
 import functools
 import logging
-import os
 from typing import (
     Any,
     Callable,
@@ -1124,69 +1122,6 @@ def args_bounds_check(
     return args[i] if len(args) > i and args[i] is not None else replacement
 
 
-def load_tensorrt_llm() -> bool:
-    """
-    Attempts to load the TensorRT-LLM plugin and initialize it.
-
-    Returns:
-        bool: True if the plugin was successfully loaded and initialized, False otherwise.
-    """
-    try:
-        import tensorrt_llm as trt_llm  # noqa: F401
-
-        _LOGGER.info("TensorRT-LLM successfully imported")
-        return True
-    except (ImportError, AssertionError) as e_import_error:
-        # Check for environment variable for the plugin library path
-        plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH")
-        if not plugin_lib_path:
-            _LOGGER.warning(
-                "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops",
-            )
-            return False
-
-        _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}")
-        try:
-            # Load the shared library
-            handle = ctypes.CDLL(plugin_lib_path)
-            _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}")
-        except OSError as e_os_error:
-            _LOGGER.error(
-                f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}"
-                f"Ensure the path is correct and the library is compatible",
-                exc_info=e_os_error,
-            )
-            return False
-
-        try:
-            # Configure plugin initialization arguments
-            handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-            handle.initTrtLlmPlugins.restype = ctypes.c_bool
-        except AttributeError as e_plugin_unavailable:
-            _LOGGER.warning(
-                "Unable to initialize the TensorRT-LLM plugin library",
-                exc_info=e_plugin_unavailable,
-            )
-            return False
-
-        try:
-            # Initialize the plugin
-            TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm"
-            if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")):
-                _LOGGER.info("TensorRT-LLM plugin successfully initialized")
-                return True
-            else:
-                _LOGGER.warning("TensorRT-LLM plugin library failed in initialization")
-                return False
-        except Exception as e_initialization_error:
-            _LOGGER.warning(
-                "Exception occurred during TensorRT-LLM plugin library initialization",
-                exc_info=e_initialization_error,
-            )
-            return False
-    return False
-
-
 def promote_trt_tensors_to_same_dtype(
     ctx: ConversionContext, lhs: TRTTensor, rhs: TRTTensor, name_prefix: str
 ) -> tuple[TRTTensor, TRTTensor]:

py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py

Lines changed: 6 additions & 6 deletions
@@ -11,15 +11,15 @@
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     dynamo_tensorrt_converter,
 )
-from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm
+from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
+    tensorrt_fused_nccl_all_gather_op,
+    tensorrt_fused_nccl_reduce_scatter_op,
+)
+from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
-if load_tensorrt_llm():
-    from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
-        tensorrt_fused_nccl_all_gather_op,
-        tensorrt_fused_nccl_reduce_scatter_op,
-    )
+if load_tensorrt_llm_for_nccl():
 
     @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op)
     def fused_nccl_gather(
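
With this change, the fused NCCL all_gather and reduce_scatter converters are registered only when load_tensorrt_llm_for_nccl() (now in torch_tensorrt.dynamo.utils) reports that the TensorRT-LLM plugin library is usable. A small sketch of checking that gate from user code is shown below; the TRTLLM_PLUGINS_PATH fallback and the library path are assumptions carried over from the removed helper, and the new utility's exact behavior is not shown in this diff:

import os

# Assumption: like the removed load_tensorrt_llm() helper, the new utility can
# fall back to a user-provided plugin library pointed to by TRTLLM_PLUGINS_PATH;
# the path below is purely illustrative.
os.environ.setdefault(
    "TRTLLM_PLUGINS_PATH", "/opt/trtllm/libnvinfer_plugin_tensorrt_llm.so"
)

from torch_tensorrt.dynamo.utils import load_tensorrt_llm_for_nccl  # noqa: E402

# The fused NCCL converters in custom_ops_converters.py are only registered
# when this returns True.
print("NCCL converters available:", load_tensorrt_llm_for_nccl())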
