From 75731c11616d196ca3be747bc340a27770bea938 Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Thu, 20 Nov 2025 19:28:57 -0800 Subject: [PATCH 1/5] test: add AIConfigurator dense model tests for Dynamo Planner Profiler --- .gitignore | 3 ++ .../profiler/utils/config_modifiers/vllm.py | 3 +- benchmarks/pyproject.toml | 2 +- .../test_profile_sla_aiconfigurator.py | 36 ++++++++++--------- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index a706c5b098..e30aeac660 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,9 @@ llm_engine.h ### Ruff ### .ruff_cache/ +### MyPy ### +.mypy_cache/ + ### Python ### __pycache__/ *.py[cod] diff --git a/benchmarks/profiler/utils/config_modifiers/vllm.py b/benchmarks/profiler/utils/config_modifiers/vllm.py index c89102ca30..3a94d1ee9b 100644 --- a/benchmarks/profiler/utils/config_modifiers/vllm.py +++ b/benchmarks/profiler/utils/config_modifiers/vllm.py @@ -123,7 +123,8 @@ def convert_config( args = break_arguments(args) # remove --is-prefill-worker flag - args.remove("--is-prefill-worker") + if "--is-prefill-worker" in args: + args.remove("--is-prefill-worker") # disable prefix caching if "--enable-prefix-caching" in args: diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index 9ee8804cd9..529d7333c5 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ ] dependencies = [ - "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759", + "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@release/0.4.0", "networkx", "pandas", "pydantic>=2", diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py index 72f1dde18e..dc1b8a1538 100644 --- a/tests/profiler/test_profile_sla_aiconfigurator.py +++ b/tests/profiler/test_profile_sla_aiconfigurator.py @@ -37,7 +37,7 @@ class TestProfileSlaAiconfigurator: """Test class for profile_sla aiconfigurator functionality.""" @pytest.fixture - def trtllm_args(self): + def llm_args(self): class Args: def __init__(self): self.model = "" @@ -78,12 +78,12 @@ def __init__(self): @pytest.mark.pre_merge @pytest.mark.asyncio @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"]) - async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg): + async def test_aiconfigurator_missing_args(self, llm_args, missing_arg): # Check that validation error happens when a required arg is missing. # Note: aic_backend_version is optional - when None, auto-detects latest version - setattr(trtllm_args, missing_arg, None) + setattr(llm_args, missing_arg, None) with pytest.raises(ValueError): - await run_profile(trtllm_args) + await run_profile(llm_args) @pytest.mark.pre_merge @pytest.mark.asyncio @@ -95,18 +95,18 @@ async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg): ("aic_backend_version", "0.1.0"), ], ) - async def test_aiconfiguator_no_data(self, trtllm_args, arg_name, bad_value): + async def test_aiconfiguator_no_data(self, llm_args, arg_name, bad_value): # Check that an appropriate error is raised when the system/model/backend # is not found in the aiconfigurator database. 
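        # Each parametrized case overrides exactly one argument (system, HF model id,
        # or backend version) with a value that has no matching entry in the
        # aiconfigurator performance database.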
- setattr(trtllm_args, arg_name, bad_value) + setattr(llm_args, arg_name, bad_value) with pytest.raises(ValueError, match="Database not found"): - await run_profile(trtllm_args) + await run_profile(llm_args) @pytest.mark.pre_merge @pytest.mark.asyncio - async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): - # Test that profile_sla works with the model & backend in the trtllm_args fixture. - await run_profile(trtllm_args) + async def test_trtllm_aiconfigurator_single_model(self, llm_args): + # Test that profile_sla works with the model & backend in the llm_args fixture. + await run_profile(llm_args) @pytest.mark.asyncio @pytest.mark.parametrize( @@ -115,6 +115,10 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): ("trtllm", None), ("trtllm", "0.20.0"), ("trtllm", "1.0.0rc3"), + ("vllm", None), + ("vllm", "0.11.0"), + ("sglang", None), + ("sglang", "0.5.1.post1"), ], ) @pytest.mark.parametrize( @@ -124,11 +128,11 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args): "meta-llama/Llama-3.1-405B", ], ) - async def test_trtllm_aiconfigurator_many( - self, trtllm_args, hf_model_id, backend, aic_backend_version + async def test_aiconfigurator_dense_models( + self, llm_args, hf_model_id, backend, aic_backend_version ): # Test that profile_sla works with a variety of backend versions and model names. - trtllm_args.aic_hf_id = hf_model_id - trtllm_args.backend = backend - trtllm_args.aic_backend_version = aic_backend_version - await run_profile(trtllm_args) + llm_args.aic_hf_id = hf_model_id + llm_args.backend = backend + llm_args.aic_backend_version = aic_backend_version + await run_profile(llm_args) From 3776be488ccd407ca76dcc96cc9f655e8d8bdefc Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Thu, 20 Nov 2025 20:47:08 -0800 Subject: [PATCH 2/5] tests/profiler/test_profile_sla_aiconfigurator.py --- benchmarks/profiler/profile_sla.py | 61 ++++++++++++++++--- .../parallelization_mapping.py | 8 +-- .../test_profile_sla_aiconfigurator.py | 2 +- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 8d3eee1f4f..93e8943469 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -21,6 +21,7 @@ import numpy as np import yaml +from dynamo.planner.defaults import WORKER_COMPONENT_NAMES from benchmarks.profiler.utils.aiperf import ( get_decode_itl_and_thpt_per_gpu, @@ -54,7 +55,6 @@ DynamoDeploymentClient, cleanup_remaining_deployments, ) -from dynamo.planner.defaults import WORKER_COMPONENT_NAMES @dataclass @@ -126,6 +126,37 @@ def add_data( logger.addHandler(console_handler) +def build_model_config_kwargs( + mapping: ParallelizationMapping, is_moe: bool, backend: str +) -> dict: + """ + Build model configuration kwargs for aiconfigurator based on parallelization mapping. 
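+    Dense models only need tp_size; MoE models additionally get moe_tp_size,
+    moe_ep_size, and attention_dp_size, plus SGLang-specific MoE settings.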
+ + Args: + mapping: Parallelization mapping containing tp/tep/dep configuration + is_moe: Whether the model is a Mixture of Experts model + backend: Backend name (e.g., "sglang", "trtllm", "vllm") + + Returns: + Dictionary of model configuration parameters for aiconfigurator + """ + model_config_kwargs = {"tp_size": mapping.get_tp_size()} + + # For MoE models, also pass moe_tp_size, moe_ep_size, and attention_dp_size + if is_moe: + model_config_kwargs["moe_tp_size"] = mapping.get_tp_size() + model_config_kwargs["moe_ep_size"] = mapping.get_expert_split() + model_config_kwargs["attention_dp_size"] = mapping.get_attn_dp_size() + + # SGLang-specific MoE configuration + if backend == "sglang": + model_config_kwargs["enable_wideep"] = True + model_config_kwargs["moe_backend"] = "deepep_moe" + model_config_kwargs["attention_backend"] = "flashinfer" + + return model_config_kwargs + + async def run_profile(args): # List to track all created deployment clients for cleanup in case of failure deployment_clients = [] @@ -140,9 +171,9 @@ async def run_profile(args): logger.info( "MoE (Mixture of Experts) model profiling, sweeping TEP/DEP size for prefill and decode" ) - assert args.backend in [ - "sglang" - ], "MoE model support is only available for SGLang" + assert args.backend in ["sglang"], ( + "MoE model support is only available for SGLang" + ) else: logger.info( "Dense model profiling, sweeping TP size for prefill and decode" @@ -272,9 +303,12 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: logger.info("Using ai-configurator to estimate prefill latency") + model_config_kwargs = build_model_config_kwargs( + mapping, args.model_info.is_moe, args.backend + ) perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf( args.isl, - tp_size=mapping.get_tp_size(), + **model_config_kwargs, ) ttft = perf_dict["context_latency"] logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms") @@ -379,8 +413,11 @@ async def run_profile(args): elif args.use_ai_configurator: # Compute max_concurrency and max_kv_tokens to know which # num_request to sweep over. 
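            # max_kv_tokens assumes each concurrent request holds isl + osl tokens of
            # KV cache, so it scales linearly with the estimated max batch size.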
+ model_config_kwargs = build_model_config_kwargs( + mapping, args.model_info.is_moe, args.backend + ) max_concurrency = ai_configurator_perf_estimator.get_max_batch_size( - args.isl, args.osl, tp_size=mapping.get_tp_size() + args.isl, args.osl, **model_config_kwargs ) max_kv_tokens = max_concurrency * (args.isl + args.osl) @@ -574,13 +611,16 @@ async def run_profile(args): if args.dry_run: logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: + model_config_kwargs = build_model_config_kwargs( + best_prefill_mapping, args.model_info.is_moe, args.backend + ) profile_prefill_aiconfigurator( work_dir, best_prefill_gpus, # num_gpus sweep_max_context_length, args.prefill_interpolation_granularity, ai_configurator_perf_estimator, - tp_size=best_prefill_mapping.get_tp_size(), + **model_config_kwargs, ) else: client = DynamoDeploymentClient( @@ -659,8 +699,11 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: attention_dp_size = best_decode_mapping.get_attn_dp_size() + model_config_kwargs = build_model_config_kwargs( + best_decode_mapping, args.model_info.is_moe, args.backend + ) max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens( - args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size() + args.isl, args.osl, **model_config_kwargs ) profile_decode_aiconfigurator( work_dir, @@ -670,7 +713,7 @@ async def run_profile(args): args.decode_interpolation_granularity, ai_configurator_perf_estimator, attention_dp_size, - tp_size=best_decode_mapping.get_tp_size(), + **model_config_kwargs, ) else: client = DynamoDeploymentClient( diff --git a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py index 311c696fab..6357d6f792 100644 --- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py +++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py @@ -60,14 +60,12 @@ def get_tp_size(self) -> int: def get_expert_split(self) -> int: """ - Get the effective expert split size. - Both TEP and DEP split experts, TP doesn't (returns 1). + Get the effective expert split size (expert parallelism, not tensor parallelism). + Only DEP splits experts across GPUs. TEP and TP don't split experts (returns 1). """ - if self.tep is not None: - return self.tep if self.dep is not None: return self.dep - return 1 # TP has expert split of 1 + return 1 # TP and TEP don't use expert parallelism def get_attn_dp_size(self) -> int: """ diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py index dc1b8a1538..86e35234c3 100644 --- a/tests/profiler/test_profile_sla_aiconfigurator.py +++ b/tests/profiler/test_profile_sla_aiconfigurator.py @@ -95,7 +95,7 @@ async def test_aiconfigurator_missing_args(self, llm_args, missing_arg): ("aic_backend_version", "0.1.0"), ], ) - async def test_aiconfiguator_no_data(self, llm_args, arg_name, bad_value): + async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value): # Check that an appropriate error is raised when the system/model/backend # is not found in the aiconfigurator database. 
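        # The assertion below relies on the raised ValueError message containing
        # "Database not found".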
setattr(llm_args, arg_name, bad_value) From f791668a00e4079b7de280d787d78567041b918f Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Fri, 21 Nov 2025 11:33:13 -0800 Subject: [PATCH 3/5] Update profile_sla.py Signed-off-by: Jason Zhou --- benchmarks/profiler/profile_sla.py | 61 +++++------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 93e8943469..c1cdeef82b 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -21,7 +21,6 @@ import numpy as np import yaml -from dynamo.planner.defaults import WORKER_COMPONENT_NAMES from benchmarks.profiler.utils.aiperf import ( get_decode_itl_and_thpt_per_gpu, @@ -55,6 +54,7 @@ DynamoDeploymentClient, cleanup_remaining_deployments, ) +from dynamo.planner.defaults import WORKER_COMPONENT_NAMES @dataclass @@ -126,37 +126,6 @@ def add_data( logger.addHandler(console_handler) -def build_model_config_kwargs( - mapping: ParallelizationMapping, is_moe: bool, backend: str -) -> dict: - """ - Build model configuration kwargs for aiconfigurator based on parallelization mapping. - - Args: - mapping: Parallelization mapping containing tp/tep/dep configuration - is_moe: Whether the model is a Mixture of Experts model - backend: Backend name (e.g., "sglang", "trtllm", "vllm") - - Returns: - Dictionary of model configuration parameters for aiconfigurator - """ - model_config_kwargs = {"tp_size": mapping.get_tp_size()} - - # For MoE models, also pass moe_tp_size, moe_ep_size, and attention_dp_size - if is_moe: - model_config_kwargs["moe_tp_size"] = mapping.get_tp_size() - model_config_kwargs["moe_ep_size"] = mapping.get_expert_split() - model_config_kwargs["attention_dp_size"] = mapping.get_attn_dp_size() - - # SGLang-specific MoE configuration - if backend == "sglang": - model_config_kwargs["enable_wideep"] = True - model_config_kwargs["moe_backend"] = "deepep_moe" - model_config_kwargs["attention_backend"] = "flashinfer" - - return model_config_kwargs - - async def run_profile(args): # List to track all created deployment clients for cleanup in case of failure deployment_clients = [] @@ -171,9 +140,9 @@ async def run_profile(args): logger.info( "MoE (Mixture of Experts) model profiling, sweeping TEP/DEP size for prefill and decode" ) - assert args.backend in ["sglang"], ( - "MoE model support is only available for SGLang" - ) + assert args.backend in [ + "sglang" + ], "MoE model support is only available for SGLang" else: logger.info( "Dense model profiling, sweeping TP size for prefill and decode" @@ -303,12 +272,9 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: logger.info("Using ai-configurator to estimate prefill latency") - model_config_kwargs = build_model_config_kwargs( - mapping, args.model_info.is_moe, args.backend - ) perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf( args.isl, - **model_config_kwargs, + tp_size=mapping.get_tp_size(), ) ttft = perf_dict["context_latency"] logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms") @@ -413,11 +379,8 @@ async def run_profile(args): elif args.use_ai_configurator: # Compute max_concurrency and max_kv_tokens to know which # num_request to sweep over. 
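            # The request-count sweep is capped at max_concurrency; max_kv_tokens is
            # derived from it just below.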
- model_config_kwargs = build_model_config_kwargs( - mapping, args.model_info.is_moe, args.backend - ) max_concurrency = ai_configurator_perf_estimator.get_max_batch_size( - args.isl, args.osl, **model_config_kwargs + args.isl, args.osl, tp_size=mapping.get_tp_size() ) max_kv_tokens = max_concurrency * (args.isl + args.osl) @@ -611,16 +574,13 @@ async def run_profile(args): if args.dry_run: logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: - model_config_kwargs = build_model_config_kwargs( - best_prefill_mapping, args.model_info.is_moe, args.backend - ) profile_prefill_aiconfigurator( work_dir, best_prefill_gpus, # num_gpus sweep_max_context_length, args.prefill_interpolation_granularity, ai_configurator_perf_estimator, - **model_config_kwargs, + tp_size=best_prefill_mapping.get_tp_size(), ) else: client = DynamoDeploymentClient( @@ -699,11 +659,8 @@ async def run_profile(args): logger.info("Skipping deployment creation in dry run mode") elif args.use_ai_configurator: attention_dp_size = best_decode_mapping.get_attn_dp_size() - model_config_kwargs = build_model_config_kwargs( - best_decode_mapping, args.model_info.is_moe, args.backend - ) max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens( - args.isl, args.osl, **model_config_kwargs + args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size() ) profile_decode_aiconfigurator( work_dir, @@ -713,7 +670,7 @@ async def run_profile(args): args.decode_interpolation_granularity, ai_configurator_perf_estimator, attention_dp_size, - **model_config_kwargs, + tp_size=best_decode_mapping.get_tp_size() ) else: client = DynamoDeploymentClient( From 03760dcd4cf7f01fb7a4f22778ff5da4d310d765 Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Fri, 21 Nov 2025 11:33:47 -0800 Subject: [PATCH 4/5] Update profile_sla.py Signed-off-by: Jason Zhou --- benchmarks/profiler/profile_sla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index c1cdeef82b..8d3eee1f4f 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -670,7 +670,7 @@ async def run_profile(args): args.decode_interpolation_granularity, ai_configurator_perf_estimator, attention_dp_size, - tp_size=best_decode_mapping.get_tp_size() + tp_size=best_decode_mapping.get_tp_size(), ) else: client = DynamoDeploymentClient( From 4de691764f858d0eb8e0934eeb471eec20e96492 Mon Sep 17 00:00:00 2001 From: Jason Zhou Date: Fri, 21 Nov 2025 11:34:56 -0800 Subject: [PATCH 5/5] Update parallelization_mapping.py Signed-off-by: Jason Zhou --- .../utils/config_modifiers/parallelization_mapping.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py index 6357d6f792..311c696fab 100644 --- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py +++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py @@ -60,12 +60,14 @@ def get_tp_size(self) -> int: def get_expert_split(self) -> int: """ - Get the effective expert split size (expert parallelism, not tensor parallelism). - Only DEP splits experts across GPUs. TEP and TP don't split experts (returns 1). + Get the effective expert split size. + Both TEP and DEP split experts, TP doesn't (returns 1). 
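+        For example, TEP=8 or DEP=8 both give an expert split of 8, while plain TP gives 1.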
""" + if self.tep is not None: + return self.tep if self.dep is not None: return self.dep - return 1 # TP and TEP don't use expert parallelism + return 1 # TP has expert split of 1 def get_attn_dp_size(self) -> int: """