 pytest tests/integration/defs/stress_test/stress_test.py::test_run_stress_test[stress-test-with-accuracy]
 """
 import contextlib
+import copy
 import itertools
 import json
 import os
@@ -587,14 +588,12 @@ def test_run_stress_test(config, stress_time_timeout, backend, |
 
 @pytest.mark.parametrize("test_mode", ["stress-test-with-accuracy"],
                          ids=lambda x: x)
-@pytest.mark.parametrize("capacity_scheduler_policy", ["MAX_UTILIZATION"],
-                         ids=lambda x: x)
-@pytest.mark.parametrize("stress_time_timeout", [(7200, 5400)],
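+# stress_time_timeout = (stress time, timeout) in seconds, per the ids below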
+@pytest.mark.parametrize("stress_time_timeout", [(3600, 5400)],
                          ids=lambda x: f"stress_time_{x[0]}s_timeout_{x[1]}s")
 @pytest.mark.parametrize(
     "model_config", [
         pytest.param(ModelConfig(model_dir="gpt_oss/gpt-oss-120b",
-                                 tp_size=2,
+                                 tp_size=1,
                                  memory_requirement=120),
                      marks=[
                          skip_pre_blackwell,
@@ -628,54 +627,48 @@ def test_disaggregated_stress_test(model_config, test_mode, |
 
     # Context server configuration
     ctx_server_config = {
-        "tensor_parallel_size": ctx_tp,
-        "pipeline_parallel_size": ctx_pp,
         "disable_overlap_scheduler": True,
         "kv_cache_config": {
-            "free_gpu_memory_fraction": 0.7,
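+            # fraction of currently free GPU memory reserved for the KV cache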
+            "free_gpu_memory_fraction": 0.6,
         },
         "enable_chunked_prefill": True,
         "max_num_tokens": 2048,
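+        # with chunked prefill, long prompts are split so that each scheduled
+        # chunk stays within the max_num_tokens budget above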
         "cache_transceiver_config": {
             "backend": "DEFAULT"
         },
-        "scheduler_config": {
-            "capacity_scheduler_policy": capacity_scheduler_policy
-        },
         "max_batch_size": 32,
         "cuda_graph_config": None,
         "print_iter_log": True,
-        "enable_attention_dp": True
     }
 
     # Generation server configuration
-    gen_server_config = {
-        "tensor_parallel_size": gen_tp,
-        "pipeline_parallel_size": gen_pp,
-        "disable_overlap_scheduler": False,
-        "kv_cache_config": {
-            "free_gpu_memory_fraction": 0.7,
-        },
-        "cache_transceiver_config": {
-            "backend": "DEFAULT"
-        },
-        "scheduler_config": {
-            "capacity_scheduler_policy": capacity_scheduler_policy
-        },
-        "max_batch_size": 32,
-        "max_num_tokens": 2048,
-        "cuda_graph_config": None,
-        "print_iter_log": True,
-        "enable_attention_dp": True
-    }
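+    # Build the generation config from a deep copy of the context config.
+    # copy.deepcopy is needed so the nested dicts (kv_cache_config,
+    # cache_transceiver_config) are not shared between the two servers and
+    # the per-server overrides below cannot leak back into ctx_server_config.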
+    gen_server_config = copy.deepcopy(ctx_server_config)
+    # gen_server_config = {
+    #     "tensor_parallel_size": gen_tp,
+    #     "pipeline_parallel_size": gen_pp,
+    #     "disable_overlap_scheduler": True,
+    #     "kv_cache_config": {
+    #         "free_gpu_memory_fraction": 0.6,
+    #     },
+    #     "cache_transceiver_config": {
+    #         "backend": "DEFAULT"
+    #     },
+    #     "max_batch_size": 32,
+    #     "max_num_tokens": 2048,
+    #     "cuda_graph_config": None,
+    #     "print_iter_log": True
+    # }
 
     # Configure CUDA graphs for pytorch backend
-    # cuda_graph_config = {
-    #     "enable_padding": True,
-    #     # "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
-    # }
-    # ctx_server_config["cuda_graph_config"] = cuda_graph_config
-    # gen_server_config["cuda_graph_config"] = cuda_graph_config
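+    # Enable CUDA graphs (with padding) on the generation server only; the
+    # context server keeps the "cuda_graph_config": None set in its config.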
+    cuda_graph_config = {
+        "enable_padding": True,
+        "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512],
+    }
+    ctx_server_config["tensor_parallel_size"] = ctx_tp
+    ctx_server_config["pipeline_parallel_size"] = ctx_pp
+    gen_server_config["tensor_parallel_size"] = gen_tp
+    gen_server_config["pipeline_parallel_size"] = gen_pp
+    gen_server_config["cuda_graph_config"] = cuda_graph_config
 
     # Set default timeout values
     server_waiting_timeout = 28800