Skip to content

Commit cd44f80

Browse files
Authored by: (author name removed during page extraction)
[#9316][feat] AutoDeploy: Add the accuracy test for Nemotron MOE models (#9317)
Signed-off-by: Chenghao Zhang <[email protected]>
1 parent 3004692 commit cd44f80

File tree

3 files changed

+32
-5
lines changed

3 files changed

+32
-5
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,11 @@ nvidia/Nemotron-H-56B-Base-8K:
161161
- quant_algo: FP8
162162
kv_cache_quant_algo: FP8
163163
accuracy: 89.27
164+
nvidia/Nemotron-MOE:
165+
- accuracy: 88.249
166+
- quant_algo: FP8
167+
kv_cache_quant_algo: FP8
168+
accuracy: 86.884
164169
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
165170
- accuracy: 37.15
166171
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,11 @@ nvidia/Nemotron-H-56B-Base-8K:
270270
- quant_algo: FP8
271271
kv_cache_quant_algo: FP8
272272
accuracy: 83.82
273+
nvidia/Nemotron-MOE:
274+
- accuracy: 77.802
275+
- quant_algo: FP8
276+
kv_cache_quant_algo: FP8
277+
accuracy: 73.879
273278
microsoft/Phi-4-mini-instruct:
274279
- accuracy: 68.98
275280
- quant_algo: FP8

tests/integration/defs/accuracy/test_llm_api_autodeploy.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pytest
1919

2020
from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
21+
from tensorrt_llm.quantization import QuantAlgo
2122
from tensorrt_llm.sampling_params import SamplingParams
2223

2324
from ..conftest import llm_models_root
@@ -153,7 +154,8 @@ def test_auto_dtype(self, enable_chunked_prefill):
153154

154155
class TestNemotronMOE(LlmapiAccuracyTestHarness):
155156
MODEL_NAME = "nvidia/Nemotron-MOE"
156-
MODEL_PATH = f"{llm_models_root()}/Nemotron-MOE/"
157+
MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Nano-3-30B-A3.5B-dev-1024"
158+
MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev"
157159

158160
def get_default_kwargs(self):
159161
return {
@@ -196,13 +198,28 @@ def get_default_sampling_params(self):
196198
use_beam_search=beam_width > 1)
197199

198200
@pytest.mark.skip_less_device_memory(32000)
199-
def test_auto_dtype(self):
200-
pytest.skip("Nemotron-MOE is not in CI yet")
201+
def test_bf16(self):
201202
kwargs = self.get_default_kwargs()
202203
sampling_params = self.get_default_sampling_params()
203-
with AutoDeployLLM(model=self.MODEL_PATH,
204-
tokenizer=self.MODEL_PATH,
204+
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
205+
tokenizer=self.MODEL_PATH_BF16,
206+
**kwargs) as llm:
207+
task = MMLU(self.MODEL_NAME)
208+
task.evaluate(llm, sampling_params=sampling_params)
209+
task = GSM8K(self.MODEL_NAME)
210+
task.evaluate(llm)
211+
212+
@pytest.mark.skip_less_device_memory(32000)
213+
def test_fp8(self):
214+
kwargs = self.get_default_kwargs()
215+
sampling_params = self.get_default_sampling_params()
216+
with AutoDeployLLM(model=self.MODEL_PATH_FP8,
217+
tokenizer=self.MODEL_PATH_FP8,
205218
**kwargs) as llm:
219+
# Manually set quant_config for FP8 model to get the accuracy threshold
220+
llm.args.quant_config.quant_algo = QuantAlgo.FP8
221+
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
222+
206223
task = MMLU(self.MODEL_NAME)
207224
task.evaluate(llm, sampling_params=sampling_params)
208225
task = GSM8K(self.MODEL_NAME)

0 commit comments

Comments (0)