Skip to content

Commit ab72c4e

Browse files
committed
update cases
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent 9831263 commit ab72c4e

File tree

2 files changed

+30
-37
lines changed

2 files changed

+30
-37
lines changed

tests/integration/defs/stress_test/stress_test.py

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
pytest tests/integration/defs/stress_test/stress_test.py::test_run_stress_test[stress-test-with-accuracy]
3030
"""
3131
import contextlib
32+
import copy
3233
import itertools
3334
import json
3435
import os
@@ -587,14 +588,12 @@ def test_run_stress_test(config, stress_time_timeout, backend,
587588

588589
@pytest.mark.parametrize("test_mode", ["stress-test-with-accuracy"],
589590
ids=lambda x: x)
590-
@pytest.mark.parametrize("capacity_scheduler_policy", ["MAX_UTILIZATION"],
591-
ids=lambda x: x)
592-
@pytest.mark.parametrize("stress_time_timeout", [(7200, 5400)],
591+
@pytest.mark.parametrize("stress_time_timeout", [(3600, 5400)],
593592
ids=lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s")
594593
@pytest.mark.parametrize(
595594
"model_config", [
596595
pytest.param(ModelConfig(model_dir="gpt_oss/gpt-oss-120b",
597-
tp_size=2,
596+
tp_size=1,
598597
memory_requirement=120),
599598
marks=[
600599
skip_pre_blackwell,
@@ -628,54 +627,48 @@ def test_disaggregated_stress_test(model_config, test_mode,
628627

629628
# Context server configuration
630629
ctx_server_config = {
631-
"tensor_parallel_size": ctx_tp,
632-
"pipeline_parallel_size": ctx_pp,
633630
"disable_overlap_scheduler": True,
634631
"kv_cache_config": {
635-
"free_gpu_memory_fraction": 0.7,
632+
"free_gpu_memory_fraction": 0.6,
636633
},
637634
"enable_chunked_prefill": True,
638635
"max_num_tokens": 2048,
639636
"cache_transceiver_config": {
640637
"backend": "DEFAULT"
641638
},
642-
"scheduler_config": {
643-
"capacity_scheduler_policy": capacity_scheduler_policy
644-
},
645639
"max_batch_size": 32,
646640
"cuda_graph_config": None,
647641
"print_iter_log": True,
648-
"enable_attention_dp": True
649642
}
650643

651644
# Generation server configuration
652-
gen_server_config = {
653-
"tensor_parallel_size": gen_tp,
654-
"pipeline_parallel_size": gen_pp,
655-
"disable_overlap_scheduler": False,
656-
"kv_cache_config": {
657-
"free_gpu_memory_fraction": 0.7,
658-
},
659-
"cache_transceiver_config": {
660-
"backend": "DEFAULT"
661-
},
662-
"scheduler_config": {
663-
"capacity_scheduler_policy": capacity_scheduler_policy
664-
},
665-
"max_batch_size": 32,
666-
"max_num_tokens": 2048,
667-
"cuda_graph_config": None,
668-
"print_iter_log": True,
669-
"enable_attention_dp": True
670-
}
645+
gen_server_config = copy.deepcopy(ctx_server_config)
646+
#gen_server_config = {
647+
# "tensor_parallel_size": gen_tp,
648+
# "pipeline_parallel_size": gen_pp,
649+
# "disable_overlap_scheduler": True,
650+
# "kv_cache_config": {
651+
# "free_gpu_memory_fraction": 0.6,
652+
# },
653+
# "cache_transceiver_config": {
654+
# "backend": "DEFAULT"
655+
# },
656+
# "max_batch_size": 32,
657+
# "max_num_tokens": 2048,
658+
# "cuda_graph_config": None,
659+
# "print_iter_log": True
660+
#}
671661

672662
# Configure CUDA graphs for pytorch backend
673-
# cuda_graph_config = {
674-
# "enable_padding": True,
675-
# # "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
676-
# }
677-
# ctx_server_config["cuda_graph_config"] = cuda_graph_config
678-
# gen_server_config["cuda_graph_config"] = cuda_graph_config
663+
cuda_graph_config = {
664+
"enable_padding": True,
665+
"batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512],
666+
}
667+
ctx_server_config["tensor_parallel_size"] = ctx_tp
668+
ctx_server_config["pipeline_parallel_size"] = ctx_pp
669+
gen_server_config["tensor_parallel_size"] = gen_tp
670+
gen_server_config["pipeline_parallel_size"] = gen_pp
671+
gen_server_config["cuda_graph_config"] = cuda_graph_config
679672

680673
# Set default timeout values
681674
server_waiting_timeout = 28800
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-GUARANTEED_NO_EVICT-pytorch-stress-test-with-accuracy]
22
stress_test/stress_test.py::test_run_stress_test[DeepSeek-V3_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
33
stress_test/stress_test.py::test_run_stress_test[DeepSeek-R1_tp8-stress_time_3600s_timeout_5400s-MAX_UTILIZATION-pytorch-stress-test-with-accuracy]
4-
stress_test/stress_test.py::test_disaggregated_stress_test[gpt-oss-120b_tp2-stress_time_7200s_timeout_5400s-MAX_UTILIZATION-stress-test-with-accuracy]
4+
stress_test/stress_test.py::test_disaggregated_stress_test[gpt-oss-120b_tp1-stress_time_3600s_timeout_5400s-stress-test-with-accuracy]
55
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus
66
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_nvfp4_4gpus

0 commit comments

Comments
 (0)