227 changes: 227 additions & 0 deletions tests/_test_utils/deploy_utils.py
@@ -0,0 +1,227 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import subprocess

import pytest
import torch

# Common test prompts for all backends
COMMON_PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]


class ModelDeployer:
def __init__(
self,
backend: str = "trtllm",
model_id: str = "",
tensor_parallel_size: int = 1,
mini_sm: int = 89,
attn_backend: str = "TRTLLM",
base_model: str = "",
eagle3_one_model: bool = True,
):
"""
Initialize the ModelDeployer.

Args:
backend: The backend to use ('vllm', 'trtllm', or 'sglang')
model_id: Path to the model
tensor_parallel_size: Tensor parallel size for distributed inference
            mini_sm: Minimum SM architecture (compute capability) required, e.g. 89 for SM 8.9
            attn_backend: Attention backend to use for TensorRT-LLM deployments
            base_model: Path to the base (target) model when model_id points to an EAGLE draft model
            eagle3_one_model: Whether to run EAGLE-3 speculative decoding in one-model mode
        """
self.backend = backend
self.model_id = model_id
self.tensor_parallel_size = tensor_parallel_size
self.mini_sm = mini_sm
self.attn_backend = attn_backend
self.base_model = base_model
self.eagle3_one_model = eagle3_one_model

def run(self):
"""Run the deployment based on the specified backend."""
if not torch.cuda.is_available() or torch.cuda.device_count() == 0:
pytest.skip("CUDA is not available")
return
if torch.cuda.get_device_capability() < (
self.mini_sm // 10,
self.mini_sm % 10,
):
pytest.skip(reason=f"Requires sm{self.mini_sm} or higher")
return

if torch.cuda.device_count() < self.tensor_parallel_size:
pytest.skip(reason=f"Requires at least {self.tensor_parallel_size} GPUs")
return
if self.backend == "vllm":
self._deploy_vllm()
elif self.backend == "trtllm":
self._deploy_trtllm()
elif self.backend == "sglang":
self._deploy_sglang()
else:
raise ValueError(f"Unknown backend: {self.backend}")
        # Log GPU status after the deployment run
gpu_status = subprocess.run(
"nvidia-smi || true", shell=True, capture_output=True, text=True, check=True
)
print("\n=== GPU Status Before Test ===")
print(gpu_status.stdout)
print("=============================\n")

def _deploy_trtllm(self):
"""Deploy a model using TensorRT-LLM."""
try:
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
except ImportError:
pytest.skip("tensorrt_llm package not available")

sampling_params = SamplingParams(max_tokens=32)
spec_config = None
llm = None
kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
if "eagle" in self.model_id.lower():
spec_config = EagleDecodingConfig(
max_draft_len=3,
speculative_model_dir=self.model_id,
eagle3_one_model=self.eagle3_one_model,
)
cuda_graph = CudaGraphConfig(
max_batch_size=1,
)
llm = LLM(
model=self.base_model,
tensor_parallel_size=self.tensor_parallel_size,
enable_attention_dp=False,
disable_overlap_scheduler=True,
enable_autotuner=False,
speculative_config=spec_config,
cuda_graph_config=cuda_graph,
kv_cache_config=kv_cache_config,
)
else:
llm = LLM(
model=self.model_id,
tensor_parallel_size=self.tensor_parallel_size,
enable_attention_dp=False,
attn_backend=self.attn_backend,
trust_remote_code=True,
max_batch_size=8,
kv_cache_config=kv_cache_config,
)

outputs = llm.generate(COMMON_PROMPTS, sampling_params)

# Print outputs
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

def _deploy_vllm(self):
"""Deploy a model using vLLM."""
try:
from vllm import LLM, SamplingParams
except ImportError:
pytest.skip("vllm package not available")

quantization_method = "modelopt"
if "FP4" in self.model_id:
quantization_method = "modelopt_fp4"
llm = LLM(
model=self.model_id,
quantization=quantization_method,
tensor_parallel_size=self.tensor_parallel_size,
trust_remote_code=True,
)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
outputs = llm.generate(COMMON_PROMPTS, sampling_params)

# Assertions and output
assert len(outputs) == len(COMMON_PROMPTS), (
f"Expected {len(COMMON_PROMPTS)} outputs, got {len(outputs)}"
)

for i, output in enumerate(outputs):
assert output.prompt == COMMON_PROMPTS[i], f"Prompt mismatch at index {i}"
assert hasattr(output, "outputs"), f"Output {i} missing 'outputs' attribute"
assert len(output.outputs) > 0, f"Output {i} has no generated text"
assert hasattr(output.outputs[0], "text"), f"Output {i} missing 'text' attribute"
assert isinstance(output.outputs[0].text, str), f"Output {i} text is not a string"
assert len(output.outputs[0].text) > 0, f"Output {i} generated empty text"

print(f"Model: {self.model_id}")
print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
print("-" * 50)

Comment on lines +140 to +175

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

vLLM quantization string may vary by version; confirm.

Some vLLM versions expect "modelopt" / "modelopt_fp4", while others use different aliases. Please confirm that the CI's vLLM version supports these strings.


🌐 Web query:

What quantization strings does vLLM accept for NVIDIA Model Optimizer (ModelOpt) FP8/FP4 checkpoints?

💡 Result:

vLLM accepts the following ModelOpt quantization strings in the hf_quant_config.json "quant_algo" / "kv_cache_quant_algo" fields:

  • "FP8" (for FP8 checkpoints; also allowed for kv_cache_quant_algo). [1]
  • "NVFP4" (for ModelOpt NVFP4/FP4 checkpoints). [1][2]

Incorrect quantization string identifiers; use "FP8" and "NVFP4" instead.

vLLM accepts "FP8" for FP8 checkpoints and "NVFP4" for ModelOpt NVFP4/FP4 checkpoints (specified in hf_quant_config.json "quant_algo" fields). The code currently uses "modelopt" and "modelopt_fp4", which are not recognized by vLLM and will fail at runtime. Update lines 141–142 to use the correct identifiers:

  • Replace "modelopt" with "FP8"
  • Replace "modelopt_fp4" with "NVFP4"
🤖 Prompt for AI Agents
In tests/_test_utils/deploy_utils.py around lines 138 to 173, the quantization
string identifiers are incorrect: replace the current defaults so that
quantization_method is "FP8" instead of "modelopt", and when the model_id
contains "FP4" set quantization_method to "NVFP4" instead of "modelopt_fp4";
update the two assignment literals accordingly so vLLM recognizes the
quantization algorithm.

def _deploy_sglang(self):
"""Deploy a model using SGLang."""
try:
import sglang as sgl
except ImportError:
pytest.skip("sglang package not available")
quantization_method = "modelopt"
if "FP4" in self.model_id:
quantization_method = "modelopt_fp4"
llm = sgl.Engine(
model_path=self.model_id,
quantization=quantization_method,
tp_size=self.tensor_parallel_size,
trust_remote_code=True,
)
print(llm.generate(["What's the age of the earth? "]))
llm.shutdown()


class ModelDeployerList:
def __init__(self, **params):
self.params = {}
for key, value in params.items():
if isinstance(value, (list, tuple)):
self.params[key] = list(value)
else:
self.params[key] = [value]

# Pre-generate all deployers for pytest compatibility
self._deployers = list(self._generate_deployers())

def _generate_deployers(self):
for values in itertools.product(*self.params.values()):
deployer = ModelDeployer(**dict(zip(self.params.keys(), values)))
# Set test case ID in format "model_id_backend"
deployer.test_id = f"{deployer.model_id}_{deployer.backend}"
yield deployer

def __iter__(self):
return iter(self._deployers)

def __len__(self):
return len(self._deployers)

def __getitem__(self, index):
return self._deployers[index]

def __str__(self):
return f"ModelDeployerList({len(self._deployers)} items)"

def __repr__(self):
return f"ModelDeployerList({len(self._deployers)} items)"
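
For illustration, a hypothetical way a test module could consume ModelDeployerList via pytest parametrization; the checkpoint path and test name below are placeholders rather than part of this PR:

# Hypothetical usage sketch; the checkpoint path is a placeholder.
import pytest

from _test_utils.deploy_utils import ModelDeployerList

DEPLOYERS = ModelDeployerList(
    backend=["trtllm", "vllm"],      # list values expand to a cartesian product
    model_id="/models/example-fp8",  # placeholder path
    tensor_parallel_size=1,
)


@pytest.mark.parametrize("deployer", DEPLOYERS, ids=lambda d: d.test_id)
def test_deployment(deployer):
    deployer.run()
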
11 changes: 6 additions & 5 deletions tests/examples/cnn_qat/test_resnet50.py
@@ -20,11 +20,10 @@
from _test_utils.torch_misc import minimum_gpu

imagenet_path = os.getenv("IMAGENET_PATH")
-if not imagenet_path or not os.path.isdir(imagenet_path):
-    pytest.skip(
-        "IMAGENET_PATH environment variable is not set or does not point to a valid directory",
-        allow_module_level=True,
-    )
+skip_no_imagenet = pytest.mark.skipif(
+    not imagenet_path or not os.path.isdir(imagenet_path),
+    reason="IMAGENET_PATH environment variable is not set or does not point to a valid directory",
+)


def _build_common_command():
@@ -59,6 +58,7 @@ def _run_qat_command(base_cmd, common_args, output_dir, example_dir="cnn_qat"):
run_example_command(full_command, example_dir)


+@skip_no_imagenet
@minimum_gpu(1)
def test_cnn_qat_single_gpu(tmp_path):
"""Test CNN QAT on single GPU."""
@@ -68,6 +68,7 @@ def test_cnn_qat_single_gpu(tmp_path):
_run_qat_command(base_command, common_args, tmp_path)


+@skip_no_imagenet
@minimum_gpu(2)
def test_cnn_qat_multi_gpu(tmp_path):
"""Test CNN QAT on multiple GPUs."""