diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 8ad3c4b493e..c8d9ee87d7d 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -212,7 +212,7 @@ GPT-OSS/BF16: - accuracy: 90.3 - kv_cache_quant_algo: FP8 accuracy: 90.3 -GPT-OSS/MXFP4: +GPT-OSS/120B-MXFP4: - accuracy: 90.3 - quant_algo: W4A8_MXFP4_MXFP8 accuracy: 90.3 @@ -229,6 +229,18 @@ GPT-OSS/MXFP4: - quant_algo: W4A16_MXFP4 kv_cache_quant_algo: FP8 accuracy: 90.3 +GPT-OSS/20B-MXFP4: + - accuracy: 85.0 + - quant_algo: W4A8_MXFP4_MXFP8 + accuracy: 85.0 + - quant_algo: W4A8_MXFP4_MXFP8 + kv_cache_quant_algo: FP8 + accuracy: 85.0 + - quant_algo: W4A16_MXFP4 + accuracy: 85.0 + - quant_algo: W4A16_MXFP4 + kv_cache_quant_algo: FP8 + accuracy: 85.0 LGAI-EXAONE/EXAONE-4.0-32B: - accuracy: 88.36 ByteDance-Seed/Seed-OSS-36B-Instruct: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 4ab1aabb24d..5a4af5a5600 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3450,7 +3450,7 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph, moe_config=MoeConfig(backend=moe_backend)) with llm: - model_name = "GPT-OSS/MXFP4" + model_name = "GPT-OSS/20B-MXFP4" task = GSM8K(model_name) task.evaluate(llm, extra_evaluator_kwargs=self.extra_evaluator_kwargs) @@ -3461,7 +3461,7 @@ def test_dummy_load_format(self): load_format="dummy", ) with llm: - model_name = "GPT-OSS/MXFP4" + model_name = "GPT-OSS/20B-MXFP4" task = GSM8K(model_name) task.evaluate(llm, is_integration_test=True) @@ -3509,7 +3509,7 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, moe_config=MoeConfig(backend=moe_backend)) with llm: - model_name = "GPT-OSS/MXFP4" + model_name = "GPT-OSS/120B-MXFP4" task = GSM8K(model_name) task.evaluate(llm, extra_evaluator_kwargs=self.extra_evaluator_kwargs) @@ -3636,7 +3636,7 @@ def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, moe_config=MoeConfig(backend=moe_backend)) with llm: - model_name = "GPT-OSS/MXFP4" + model_name = "GPT-OSS/20B-MXFP4" task = GSM8K(model_name) mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, @@ -3663,7 +3663,7 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, dtype=kv_cache_dtype) - model_name = "GPT-OSS/MXFP4" + model_name = "GPT-OSS/120B-MXFP4" with LLM(self.MODEL_PATH, tensor_parallel_size=4, pipeline_parallel_size=1, diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2b3d1eb58b2..3ff1355b9e0 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -388,7 +388,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5582277) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] SKIP (https://nvbugs/5582277) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5582277) -accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8] SKIP (https://nvbugs/5608790) accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5547414) triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414) disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5607238)