
Commit a86c7af

Merge branch 'main' into jennifchen/nmh-moe-export

2 parents: d48514d + 4476f21

15 files changed: +130 −45 lines changed

.github/workflows/gpu_tests.yml

Lines changed: 1 addition & 2 deletions
@@ -73,8 +73,7 @@ jobs:
       - uses: nv-gha-runners/setup-proxy-cache@main
       - name: Setup environment variables
         run: |
-          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
-          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
         run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:

.gitlab/tests.yml

Lines changed: 1 addition & 3 deletions
@@ -35,9 +35,7 @@ unit:
   tags: [docker, linux, 2-gpu]
   before_script:
     # Add libcudnn*.so and libnv*.so to path
-    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    # Add trtexec to path
-    - export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"
     # Install git-lfs for Daring-Anteater dataset
     - apt-get update && apt-get install -y git-lfs
     - git lfs install --system

CHANGELOG.rst

Lines changed: 2 additions & 4 deletions
@@ -1,17 +1,15 @@
 Model Optimizer Changelog (Linux)
 =================================

-0.39 (2025-11-xx)
+0.39 (2025-11-07)
 ^^^^^^^^^^^^^^^^^

-**Deprecations**
-
 **New Features**

 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
-- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` if no dataset is specified.
+- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified.
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.

 **Documentation**
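As a quick illustration of the first new feature above, a hedged usage sketch follows. The ``quantize`` entry point and the ``quantize_mode`` values appear in the ``modelopt/onnx/quantization/quantize.py`` diff below, but the remaining argument names here are assumptions for illustration, not a verified signature.

    # Hypothetical usage of the new op_types_to_exclude_fp16 flag; every
    # argument name except the flag itself is an assumption for illustration.
    from modelopt.onnx.quantization.quantize import quantize

    quantize(
        onnx_path="model.onnx",  # assumed parameter name
        quantize_mode="int8",    # "fp8"/"int8" are checked in quantize.py below
        op_types_to_exclude_fp16=["Softmax", "LayerNormalization"],  # keep in FP32
    )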

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 1 addition & 2 deletions
@@ -41,8 +41,7 @@ Environment setup
 .. code-block:: shell

     export PIP_CONSTRAINT=""
-    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"


 You may need to install additional dependencies from the respective examples's `requirements.txt` file.

examples/diffusers/quantization/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -4,3 +4,6 @@ nvtx
 onnx_graphsurgeon
 opencv-python>=4.8.1.78,<4.12.0.88
 sentencepiece
+# TODO: Fix for torch 2.9
+torch<2.9
+torchvision<0.24.0

examples/llm_sparsity/launch_finetune.sh

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
     --warmup_ratio 0.0 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
-    --fsdp full_shard auto_wrap \
+    --fsdp 'full_shard auto_wrap' \
     --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
     --tf32 True \
     --modelopt_restore_path $MODELOPT_RESTORE_PATH \
Lines changed: 0 additions & 1 deletion

@@ -1,4 +1,3 @@
 flash-attn
 sentencepiece>=0.2.0
 tensorboardX
-transformers>=4.57.0

modelopt/onnx/quantization/graph_utils.py

Lines changed: 13 additions & 11 deletions
@@ -27,7 +27,6 @@
 from onnx_graphsurgeon.ir.node import Node
 from onnx_graphsurgeon.ir.tensor import Constant, Tensor, Variable
 from onnxruntime.quantization.calibrate import CalibrationDataReader
-from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference

 from modelopt.onnx.logging_config import logger
 from modelopt.onnx.op_types import is_copy_op, is_linear_op
@@ -36,6 +35,7 @@
     find_lowest_common_ancestor,
     get_child_nodes,
     get_parent_nodes,
+    infer_shapes,
     parse_shapes_spec,
     save_onnx,
 )
@@ -966,7 +966,7 @@ def find_nodes_from_matmul_to_exclude(
     logger.debug(f"Found {len(matmul_nodes)} MatMul nodes to analyze")

     if calibration_shapes:
-        nodes_to_exclude = _exclude_matmuls_by_symbolic_inference(
+        nodes_to_exclude = _exclude_matmuls_by_shape_inference(
             model, matmul_nodes, calibration_shapes
         )
     else:
@@ -1058,10 +1058,10 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
     return unsupported_conv_nodes


-def _exclude_matmuls_by_symbolic_inference(
+def _exclude_matmuls_by_shape_inference(
     model: onnx.ModelProto, matmul_nodes: list, calibration_shapes: str | dict | None = None
 ) -> list[str]:
-    """Use symbolic shape inference to find MatMuls with dimension 1."""
+    """Use shape inference to find MatMuls with dimension 1."""
     # Prepare model for symbolic inference
     for graph_input in model.graph.input:
         for dim in graph_input.type.tensor_type.shape.dim:
@@ -1070,11 +1070,13 @@ def _exclude_matmuls_by_symbolic_inference(
             dim.dim_value = 1

     # Apply calibration shapes if provided
-    input_shapes = (
-        parse_shapes_spec(calibration_shapes)
-        if (calibration_shapes and isinstance(calibration_shapes, str))
-        else {}
-    )
+    input_shapes = {}
+    if calibration_shapes:
+        input_shapes = (
+            parse_shapes_spec(calibration_shapes)
+            if isinstance(calibration_shapes, str)
+            else calibration_shapes
+        )
     for graph_input in model.graph.input:
         if graph_input.name in input_shapes:
             input_shape = input_shapes[graph_input.name]
@@ -1087,9 +1089,9 @@ def _exclude_matmuls_by_symbolic_inference(
             for dim, new_dim_value in zip(tensor_shape, input_shape):
                 dim.dim_value = new_dim_value

-    model.graph.ClearField("value_info")
-    model = SymbolicShapeInference.infer_shapes(model)
+    model = infer_shapes(model)
     value_info_map = {vi.name: vi for vi in model.graph.value_info}
+    value_info_map.update({vi.name: vi for vi in model.graph.output})

     nodes_to_exclude = []
     for matmul_node in matmul_nodes:
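The rewritten ``input_shapes`` block above now accepts ``calibration_shapes`` either as a spec string (parsed via ``parse_shapes_spec``) or as an already-built dict. A self-contained sketch of that normalization, assuming a "name:AxBxC,..." spec format (the real format handled by ``parse_shapes_spec`` may differ):

    # Standalone sketch of the str-or-dict normalization; the "name:AxBxC"
    # spec format is an assumption, not necessarily what parse_shapes_spec uses.
    def normalize_calibration_shapes(
        calibration_shapes: str | dict | None,
    ) -> dict[str, list[int]]:
        input_shapes: dict[str, list[int]] = {}
        if calibration_shapes:
            if isinstance(calibration_shapes, str):
                # Parse "input_a:1x16x16,input_b:4" into {"input_a": [1, 16, 16], ...}
                for spec in calibration_shapes.split(","):
                    name, _, dims = spec.rpartition(":")
                    input_shapes[name] = [int(d) for d in dims.split("x")]
            else:
                input_shapes = dict(calibration_shapes)
        return input_shapes

    assert normalize_calibration_shapes("x:1x3x224x224") == {"x": [1, 3, 224, 224]}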

modelopt/onnx/quantization/quantize.py

Lines changed: 1 addition & 1 deletion
@@ -470,7 +470,7 @@ def quantize(
         calibration_eps,
     )

-    if not calibration_shapes:
+    if calibrate_per_node and not calibration_shapes:
         calibration_shapes = get_input_shapes(onnx_path)

     if quantize_mode in ["fp8", "int8"]:
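The new guard above only auto-detects input shapes when per-node calibration is requested and the caller supplied no ``calibration_shapes``. ``get_input_shapes`` itself is not shown in this diff; a minimal illustrative stand-in using only the public ``onnx`` API could look like this:

    import onnx

    def read_graph_input_shapes(onnx_path: str) -> dict[str, list[int]]:
        # Illustrative stand-in for get_input_shapes: collect each graph
        # input's shape, reporting dynamic (symbolic) dimensions as -1.
        model = onnx.load(onnx_path)
        return {
            inp.name: [
                d.dim_value if d.HasField("dim_value") else -1
                for d in inp.type.tensor_type.shape.dim
            ]
            for inp in model.graph.input
        }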

modelopt/onnx/trt_utils.py

Lines changed: 21 additions & 2 deletions
@@ -36,6 +36,8 @@
 except ImportError:
     TRT_PYTHON_AVAILABLE = False

+MAX_IR_VERSION = 10
+

 def get_custom_layers(
     onnx_path: str | onnx.ModelProto,
@@ -296,7 +298,8 @@ def load_onnx_model(

     static_shaped_onnx_path = onnx_path.replace(".onnx", "_static.onnx")
     save_onnx(onnx_model, static_shaped_onnx_path, use_external_data_format)
-    intermediate_generated_files.append(static_shaped_onnx_path)  # type: ignore[union-attr]
+    if intermediate_generated_files is not None:
+        intermediate_generated_files.append(static_shaped_onnx_path)

     if TRT_PYTHON_AVAILABLE and platform.system() != "Windows":
         # Check if there's a custom TensorRT op in the ONNX model. If so, make it ORT compatible by adding
@@ -318,11 +321,27 @@ def load_onnx_model(
     # Infer types and shapes in the graph for ORT compatibility
     onnx_model = infer_types_shapes_tensorrt(onnx_model, trt_plugins or [], all_tensor_info)

+    # Enforce IR version = 10
+    ir_version_onnx_path = None
+    if onnx_model.ir_version > MAX_IR_VERSION:
+        onnx_model.ir_version = MAX_IR_VERSION
+        ir_version_onnx_path = (
+            static_shaped_onnx_path.replace(".onnx", f"_ir{MAX_IR_VERSION}.onnx")
+            if static_shaped_onnx_path
+            else onnx_path.replace(".onnx", f"_ir{MAX_IR_VERSION}.onnx")
+        )
+        save_onnx(onnx_model, ir_version_onnx_path, use_external_data_format)
+        if intermediate_generated_files is not None:
+            intermediate_generated_files.append(ir_version_onnx_path)
+
+    # Check that the model is valid
+    onnx.checker.check_model(onnx_model)
+
     return (
         onnx_model,
         has_custom_op,
         custom_ops,
-        static_shaped_onnx_path or onnx_path,
+        ir_version_onnx_path or static_shaped_onnx_path or onnx_path,
         use_external_data_format,
     )
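Taken in isolation, the IR-version enforcement added above reduces to the following minimal sketch, using only the public ``onnx`` API and omitting the save and bookkeeping steps that ``load_onnx_model`` performs:

    import onnx

    MAX_IR_VERSION = 10  # mirrors the constant introduced in this diff

    def clamp_ir_version(model: onnx.ModelProto) -> onnx.ModelProto:
        # Downgrade the model's IR version to MAX_IR_VERSION when it is newer,
        # then validate; onnx.checker raises if the result is malformed.
        if model.ir_version > MAX_IR_VERSION:
            model.ir_version = MAX_IR_VERSION
        onnx.checker.check_model(model)
        return model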
