From 75731c11616d196ca3be747bc340a27770bea938 Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Thu, 20 Nov 2025 19:28:57 -0800 Subject: [PATCH 1/5] test: add AIConfigurator dense model tests for Dynamo Planner Profiler --- .gitignore | 3 ++ .../profiler/utils/config_modifiers/vllm.py | 3 +- benchmarks/pyproject.toml | 2 +- .../test_profile_sla_aiconfigurator.py | 36 ++++++++++--------- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index a706c5b098..e30aeac660 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,9 @@ llm_engine.h ### Ruff ### .ruff_cache/ +### MyPy ### +.mypy_cache/ + ### Python ### __pycache__/ *.py[cod] diff --git a/benchmarks/profiler/utils/config_modifiers/vllm.py b/benchmarks/profiler/utils/config_modifiers/vllm.py index c89102ca30..3a94d1ee9b 100644 --- a/benchmarks/profiler/utils/config_modifiers/vllm.py +++ b/benchmarks/profiler/utils/config_modifiers/vllm.py @@ -123,7 +123,8 @@ def convert_config( args = break_arguments(args) # remove --is-prefill-worker flag - args.remove("--is-prefill-worker") + if "--is-prefill-worker" in args: + args.remove("--is-prefill-worker") # disable prefix caching if "--enable-prefix-caching" in args: diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index 9ee8804cd9..529d7333c5 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ ] dependencies = [ - "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759", + "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@release/0.4.0", "networkx", "pandas", "pydantic>=2", diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py index 72f1dde18e..dc1b8a1538 100644 --- a/tests/profiler/test_profile_sla_aiconfigurator.py +++ b/tests/profiler/test_profile_sla_aiconfigurator.py @@ -37,7 +37,7 @@ class TestProfileSlaAiconfigurator: """Test class for profile_sla aiconfigurator functionality.""" @pytest.fixture - def trtllm_args(self): + def llm_args(self): class Args: def __init__(self): self.model = "" @@ -78,12 +78,12 @@ def __init__(self): @pytest.mark.pre_merge @pytest.mark.asyncio @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"]) - async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg): + async def test_aiconfigurator_missing_args(self, llm_args, missing_arg): # Check that validation error happens when a required arg is missing. # Note: aic_backend_version is optional - when None, auto-detects latest version - setattr(trtllm_args, missing_arg, None) + setattr(llm_args, missing_arg, None) with pytest.raises(ValueError): - await run_profile(trtllm_args) + await run_profile(llm_args) @pytest.mark.pre_merge @pytest.mark.asyncio @@ -95,18 +95,18 @@ async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg): ("aic_backend_version", "0.1.0"), ], ) - async def test_aiconfiguator_no_data(self, trtllm_args, arg_name, bad_value): + async def test_aiconfiguator_no_data(self, llm_args, arg_name, bad_value): # Check that an appropriate error is raised when the system/model/backend # is not found in the aiconfigurator database. 
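        # Each parametrized case overrides exactly one argument (system, HF model id,
        # or backend version) with a value that has no matching entry in the
        # aiconfigurator performance database.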
- setattr(trtllm_args, arg_name, bad_value) + setattr(llm_args, arg_name, bad_value) with pytest.raises(ValueError, match="Database not found"): - await run_profile(trtllm_args) + await run_profile(llm_args) @pytest.mark.pre_merge @pytest.mark.asyncio - async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): - # Test that profile_sla works with the model & backend in the trtllm_args fixture. - await run_profile(trtllm_args) + async def test_trtllm_aiconfigurator_single_model(self, llm_args): + # Test that profile_sla works with the model & backend in the llm_args fixture. + await run_profile(llm_args) @pytest.mark.asyncio @pytest.mark.parametrize( @@ -115,6 +115,10 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): ("trtllm", None), ("trtllm", "0.20.0"), ("trtllm", "1.0.0rc3"), + ("vllm", None), + ("vllm", "0.11.0"), + ("sglang", None), + ("sglang", "0.5.1.post1"), ], ) @pytest.mark.parametrize( @@ -124,11 +128,11 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): "meta-llama/Llama-3.1-405B", ], ) - async def test_trtllm_aiconfigurator_many( - self, trtllm_args, hf_model_id, backend, aic_backend_version + async def test_aiconfigurator_dense_models( + self, llm_args, hf_model_id, backend, aic_backend_version ): # Test that profile_sla works with a variety of backend versions and model names. - trtllm_args.aic_hf_id = hf_model_id - trtllm_args.backend = backend - trtllm_args.aic_backend_version = aic_backend_version - await run_profile(trtllm_args) + llm_args.aic_hf_id = hf_model_id + llm_args.backend = backend + llm_args.aic_backend_version = aic_backend_version + await run_profile(llm_args) From 3776be488ccd407ca76dcc96cc9f655e8d8bdefc Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Thu, 20 Nov 2025 20:47:08 -0800 Subject: [PATCH 2/5] tests/profiler/test_profile_sla_aiconfigurator.py --- benchmarks/profiler/profile_sla.py | 61 ++++++++++++++++--- .../parallelization_mapping.py | 8 +-- .../test_profile_sla_aiconfigurator.py | 2 +- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 8d3eee1f4f..93e8943469 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -21,6 +21,7 @@ import numpy as np import yaml +from dynamo.planner.defaults import WORKER_COMPONENT_NAMES from benchmarks.profiler.utils.aiperf import ( get_decode_itl_and_thpt_per_gpu, @@ -54,7 +55,6 @@ DynamoDeploymentClient, cleanup_remaining_deployments, ) -from dynamo.planner.defaults import WORKER_COMPONENT_NAMES @dataclass @@ -126,6 +126,37 @@ def add_data( logger.addHandler(console_handler) +def build_model_config_kwargs( + mapping: ParallelizationMapping, is_moe: bool, backend: str +) -> dict: + """ + Build model configuration kwargs for aiconfigurator based on parallelization mapping. 
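+    Dense models only need tp_size; MoE models additionally get moe_tp_size,
+    moe_ep_size, and attention_dp_size, plus SGLang-specific MoE settings.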
+ + Args: + mapping: Parallelization mapping containing tp/tep/dep configuration + is_moe: Whether the model is a Mixture of Experts model + backend: Backend name (e.g., "sglang", "trtllm", "vllm") + + Returns: + Dictionary of model configuration parameters for aiconfigurator + """ + model_config_kwargs = {"tp_size": mapping.get_tp_size()} + + # For MoE models, also pass moe_tp_size, moe_ep_size, and attention_dp_size + if is_moe: + model_config_kwargs["moe_tp_size"] = mapping.get_tp_size() + model_config_kwargs["moe_ep_size"] = mapping.get_expert_split() + model_config_kwargs["attention_dp_size"] = mapping.get_attn_dp_size() + + # SGLang-specific MoE configuration + if backend == "sglang": + model_config_kwargs["enable_wideep"] = True + model_config_kwargs["moe_backend"] = "deepep_moe" + model_config_kwargs["attention_backend"] = "flashinfer" + + return model_config_kwargs + + async def run_profile(args): # List to track all created deployment clients for cleanup in case of failure deployment_clients = [] @@ -140,9 +171,9 @@ async def run_profile(args): logger.info( "MoE (Mixture of Experts) model profiling, sweeping TEP/DEP size for prefill and decode" ) - assert args.backend in [ - "sglang" - ], "MoE model support is only available for SGLang" + assert args.backend in ["sglang"], ( + "MoE model support is only available for SGLang" + ) else: logger.info( "Dense model profiling, sweeping TP size for prefill and decode" @@ -272,9 +303,12 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: logger.info("Using ai-configurator to estimate prefill latency") + model_config_kwargs = build_model_config_kwargs( + mapping, args.model_info.is_moe, args.backend + ) perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf( args.isl, - tp_size=mapping.get_tp_size(), + **model_config_kwargs, ) ttft = perf_dict["context_latency"] logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms") @@ -379,8 +413,11 @@ async def run_profile(args): elif args.use_ai_configurator: # Compute max_concurrency and max_kv_tokens to know which # num_request to sweep over. 
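            # max_kv_tokens assumes each concurrent request holds isl + osl tokens of
            # KV cache, so it scales linearly with the estimated max batch size.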
+ model_config_kwargs = build_model_config_kwargs( + mapping, args.model_info.is_moe, args.backend + ) max_concurrency = ai_configurator_perf_estimator.get_max_batch_size( - args.isl, args.osl, tp_size=mapping.get_tp_size() + args.isl, args.osl, **model_config_kwargs ) max_kv_tokens = max_concurrency * (args.isl + args.osl) @@ -574,13 +611,16 @@ async def run_profile(args): if args.dry_run: logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: + model_config_kwargs = build_model_config_kwargs( + best_prefill_mapping, args.model_info.is_moe, args.backend + ) profile_prefill_aiconfigurator( work_dir, best_prefill_gpus, # num_gpus sweep_max_context_length, args.prefill_interpolation_granularity, ai_configurator_perf_estimator, - tp_size=best_prefill_mapping.get_tp_size(), + **model_config_kwargs, ) else: client = DynamoDeploymentClient( @@ -659,8 +699,11 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: attention_dp_size = best_decode_mapping.get_attn_dp_size() + model_config_kwargs = build_model_config_kwargs( + best_decode_mapping, args.model_info.is_moe, args.backend + ) max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens( - args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size() + args.isl, args.osl, **model_config_kwargs ) profile_decode_aiconfigurator( work_dir, @@ -670,7 +713,7 @@ async def run_profile(args): args.decode_interpolation_granularity, ai_configurator_perf_estimator, attention_dp_size, - tp_size=best_decode_mapping.get_tp_size(), + **model_config_kwargs, ) else: client = DynamoDeploymentClient( diff --git a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py index 311c696fab..6357d6f792 100644 --- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py +++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py @@ -60,14 +60,12 @@ def get_tp_size(self) -> int: def get_expert_split(self) -> int: """ - Get the effective expert split size. - Both TEP and DEP split experts, TP doesn't (returns 1). + Get the effective expert split size (expert parallelism, not tensor parallelism). + Only DEP splits experts across GPUs. TEP and TP don't split experts (returns 1). """ - if self.tep is not None: - return self.tep if self.dep is not None: return self.dep - return 1 # TP has expert split of 1 + return 1 # TP and TEP don't use expert parallelism def get_attn_dp_size(self) -> int: """ diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py index dc1b8a1538..86e35234c3 100644 --- a/tests/profiler/test_profile_sla_aiconfigurator.py +++ b/tests/profiler/test_profile_sla_aiconfigurator.py @@ -95,7 +95,7 @@ async def test_aiconfigurator_missing_args(self, llm_args, missing_arg): ("aic_backend_version", "0.1.0"), ], ) - async def test_aiconfiguator_no_data(self, llm_args, arg_name, bad_value): + async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value): # Check that an appropriate error is raised when the system/model/backend # is not found in the aiconfigurator database. 
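        # The assertion below relies on the raised ValueError message containing
        # "Database not found".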
setattr(llm_args, arg_name, bad_value) From f791668a00e4079b7de280d787d78567041b918f Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Fri, 21 Nov 2025 11:33:13 -0800 Subject: [PATCH 3/5] Update profile_sla.py Signed-off-by: Jason Zhou --- benchmarks/profiler/profile_sla.py | 61 +++++------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 93e8943469..c1cdeef82b 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -21,7 +21,6 @@ import numpy as np import yaml -from dynamo.planner.defaults import WORKER_COMPONENT_NAMES from benchmarks.profiler.utils.aiperf import ( get_decode_itl_and_thpt_per_gpu, @@ -55,6 +54,7 @@ DynamoDeploymentClient, cleanup_remaining_deployments, ) +from dynamo.planner.defaults import WORKER_COMPONENT_NAMES @dataclass @@ -126,37 +126,6 @@ def add_data( logger.addHandler(console_handler) -def build_model_config_kwargs( - mapping: ParallelizationMapping, is_moe: bool, backend: str -) -> dict: - """ - Build model configuration kwargs for aiconfigurator based on parallelization mapping. - - Args: - mapping: Parallelization mapping containing tp/tep/dep configuration - is_moe: Whether the model is a Mixture of Experts model - backend: Backend name (e.g., "sglang", "trtllm", "vllm") - - Returns: - Dictionary of model configuration parameters for aiconfigurator - """ - model_config_kwargs = {"tp_size": mapping.get_tp_size()} - - # For MoE models, also pass moe_tp_size, moe_ep_size, and attention_dp_size - if is_moe: - model_config_kwargs["moe_tp_size"] = mapping.get_tp_size() - model_config_kwargs["moe_ep_size"] = mapping.get_expert_split() - model_config_kwargs["attention_dp_size"] = mapping.get_attn_dp_size() - - # SGLang-specific MoE configuration - if backend == "sglang": - model_config_kwargs["enable_wideep"] = True - model_config_kwargs["moe_backend"] = "deepep_moe" - model_config_kwargs["attention_backend"] = "flashinfer" - - return model_config_kwargs - - async def run_profile(args): # List to track all created deployment clients for cleanup in case of failure deployment_clients = [] @@ -171,9 +140,9 @@ async def run_profile(args): logger.info( "MoE (Mixture of Experts) model profiling, sweeping TEP/DEP size for prefill and decode" ) - assert args.backend in ["sglang"], ( - "MoE model support is only available for SGLang" - ) + assert args.backend in [ + "sglang" + ], "MoE model support is only available for SGLang" else: logger.info( "Dense model profiling, sweeping TP size for prefill and decode" @@ -303,12 +272,9 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: logger.info("Using ai-configurator to estimate prefill latency") - model_config_kwargs = build_model_config_kwargs( - mapping, args.model_info.is_moe, args.backend - ) perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf( args.isl, - **model_config_kwargs, + tp_size=mapping.get_tp_size(), ) ttft = perf_dict["context_latency"] logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms") @@ -413,11 +379,8 @@ async def run_profile(args): elif args.use_ai_configurator: # Compute max_concurrency and max_kv_tokens to know which # num_request to sweep over. 
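            # The request-count sweep is capped at max_concurrency; max_kv_tokens is
            # derived from it just below.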
- model_config_kwargs = build_model_config_kwargs( - mapping, args.model_info.is_moe, args.backend - ) max_concurrency = ai_configurator_perf_estimator.get_max_batch_size( - args.isl, args.osl, **model_config_kwargs + args.isl, args.osl, tp_size=mapping.get_tp_size() ) max_kv_tokens = max_concurrency * (args.isl + args.osl) @@ -611,16 +574,13 @@ async def run_profile(args): if args.dry_run: logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: - model_config_kwargs = build_model_config_kwargs( - best_prefill_mapping, args.model_info.is_moe, args.backend - ) profile_prefill_aiconfigurator( work_dir, best_prefill_gpus, # num_gpus sweep_max_context_length, args.prefill_interpolation_granularity, ai_configurator_perf_estimator, - **model_config_kwargs, + tp_size=best_prefill_mapping.get_tp_size(), ) else: client = DynamoDeploymentClient( @@ -699,11 +659,8 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: attention_dp_size = best_decode_mapping.get_attn_dp_size() - model_config_kwargs = build_model_config_kwargs( - best_decode_mapping, args.model_info.is_moe, args.backend - ) max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens( - args.isl, args.osl, **model_config_kwargs + args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size() ) profile_decode_aiconfigurator( work_dir, @@ -713,7 +670,7 @@ async def run_profile(args): args.decode_interpolation_granularity, ai_configurator_perf_estimator, attention_dp_size, - **model_config_kwargs, + tp_size=best_decode_mapping.get_tp_size() ) else: client = DynamoDeploymentClient( From 03760dcd4cf7f01fb7a4f22778ff5da4d310d765 Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Fri, 21 Nov 2025 11:33:47 -0800 Subject: [PATCH 4/5] Update profile_sla.py Signed-off-by: Jason Zhou --- benchmarks/profiler/profile_sla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index c1cdeef82b..8d3eee1f4f 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -670,7 +670,7 @@ async def run_profile(args): args.decode_interpolation_granularity, ai_configurator_perf_estimator, attention_dp_size, - tp_size=best_decode_mapping.get_tp_size() + tp_size=best_decode_mapping.get_tp_size(), ) else: client = DynamoDeploymentClient( From 4de691764f858d0eb8e0934eeb471eec20e96492 Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Fri, 21 Nov 2025 11:34:56 -0800 Subject: [PATCH 5/5] Update parallelization_mapping.py Signed-off-by: Jason Zhou --- .../utils/config_modifiers/parallelization_mapping.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py index 6357d6f792..311c696fab 100644 --- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py +++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py @@ -60,12 +60,14 @@ def get_tp_size(self) -> int: def get_expert_split(self) -> int: """ - Get the effective expert split size (expert parallelism, not tensor parallelism). - Only DEP splits experts across GPUs. TEP and TP don't split experts (returns 1). + Get the effective expert split size. + Both TEP and DEP split experts, TP doesn't (returns 1). 
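+        For example, TEP=8 or DEP=8 both give an expert split of 8, while plain TP gives 1.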
""" + if self.tep is not None: + return self.tep if self.dep is not None: return self.dep - return 1 # TP and TEP don't use expert parallelism + return 1 # TP has expert split of 1 def get_attn_dp_size(self) -> int: """