NVIDIA · xinhe-nv · Oct 28, 2025 · Oct 25, 2025 · Oct 27, 2025
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -212,7 +212,7 @@ GPT-OSS/BF16:
   - accuracy: 90.3
   - kv_cache_quant_algo: FP8
     accuracy: 90.3
-GPT-OSS/MXFP4:
+GPT-OSS/120B-MXFP4:
   - accuracy: 90.3
   - quant_algo: W4A8_MXFP4_MXFP8
     accuracy: 90.3
@@ -229,6 +229,18 @@ GPT-OSS/MXFP4:
   - quant_algo: W4A16_MXFP4
     kv_cache_quant_algo: FP8
     accuracy: 90.3
+GPT-OSS/20B-MXFP4:
+  - accuracy: 85.0
+  - quant_algo: W4A8_MXFP4_MXFP8
+    accuracy: 85.0
+  - quant_algo: W4A8_MXFP4_MXFP8
+    kv_cache_quant_algo: FP8
+    accuracy: 85.0
+  - quant_algo: W4A16_MXFP4
+    accuracy: 85.0
+  - quant_algo: W4A16_MXFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 85.0
 LGAI-EXAONE/EXAONE-4.0-32B:
   - accuracy: 88.36
 ByteDance-Seed/Seed-OSS-36B-Instruct:

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -3450,7 +3450,7 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
                   moe_config=MoeConfig(backend=moe_backend))
 
         with llm:
-            model_name = "GPT-OSS/MXFP4"
+            model_name = "GPT-OSS/20B-MXFP4"
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -3461,7 +3461,7 @@ def test_dummy_load_format(self):
             load_format="dummy",
         )
         with llm:
-            model_name = "GPT-OSS/MXFP4"
+            model_name = "GPT-OSS/20B-MXFP4"
             task = GSM8K(model_name)
             task.evaluate(llm, is_integration_test=True)
 
@@ -3509,7 +3509,7 @@ def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
                   moe_config=MoeConfig(backend=moe_backend))
 
         with llm:
-            model_name = "GPT-OSS/MXFP4"
+            model_name = "GPT-OSS/120B-MXFP4"
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -3636,7 +3636,7 @@ def test_w4_2gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size,
                   moe_config=MoeConfig(backend=moe_backend))
 
         with llm:
-            model_name = "GPT-OSS/MXFP4"
+            model_name = "GPT-OSS/20B-MXFP4"
             task = GSM8K(model_name)
             mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
@@ -3663,7 +3663,7 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                         dtype=kv_cache_dtype)
 
-        model_name = "GPT-OSS/MXFP4"
+        model_name = "GPT-OSS/120B-MXFP4"
         with LLM(self.MODEL_PATH,
                  tensor_parallel_size=4,
                  pipeline_parallel_size=1,

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -388,7 +388,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5582277)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] SKIP (https://nvbugs/5582277)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5582277)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-fp8] SKIP (https://nvbugs/5608790)
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] SKIP (https://nvbugs/5547414)
 triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5607238)