
Commit a09b38a

[TRTLLM-8684][chore] Migrate BuildConfig to Pydantic, add a Python wrapper for KVCacheType enum (#8330)
Signed-off-by: Anish Shanbhag <[email protected]>
Parent: cdc9e5e

32 files changed: +363 additions, −429 deletions


examples/models/core/llama/summarize_long.py

Lines changed: 2 additions & 2 deletions

@@ -23,7 +23,7 @@

 import tensorrt_llm
 import tensorrt_llm.profiler as profiler
-from tensorrt_llm.bindings import KVCacheType
+from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
 from tensorrt_llm.logger import logger
 from tensorrt_llm.quantization import QuantMode

@@ -97,7 +97,7 @@ def TRTLLaMA(args, config):
     quantization_config = pretrained_config['quantization']

     build_config = config['build_config']
-    kv_cache_type = KVCacheType.from_string(build_config['kv_cache_type'])
+    kv_cache_type = KVCacheType(build_config['kv_cache_type'])
     plugin_config = build_config['plugin_config']

     dtype = pretrained_config['dtype']
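
Why the plain constructor call now works: the new wrapper in tensorrt_llm.llmapi.kv_cache_type is a regular Python enum, so it can be called directly on the serialized string value, and the pybind-specific from_string helper is no longer needed. A minimal sketch of what a string-backed enum with tolerant lookup looks like (the member set and the _missing_ hook are assumptions about the wrapper's shape, not its actual implementation):

from enum import Enum


class KVCacheType(str, Enum):
    # Members assumed to mirror the C++ binding enum.
    CONTINUOUS = "CONTINUOUS"
    PAGED = "PAGED"
    DISABLED = "DISABLED"

    @classmethod
    def _missing_(cls, value):
        # Make KVCacheType("paged") and KVCacheType("PAGED") both resolve,
        # matching the tolerant lookup that from_string used to provide.
        if isinstance(value, str):
            return cls.__members__.get(value.upper())
        return None


print(KVCacheType("paged"))  # KVCacheType.PAGED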

examples/models/core/qwen2audio/run.py

Lines changed: 2 additions & 3 deletions

@@ -27,7 +27,7 @@
 import tensorrt_llm
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm import logger
-from tensorrt_llm.bindings import KVCacheType
+from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
 from tensorrt_llm.quantization import QuantMode
 from tensorrt_llm.runtime import (PYTHON_BINDINGS, ModelConfig, ModelRunner,
                                   SamplingConfig, Session, TensorInfo)

@@ -122,8 +122,7 @@ def get_model(self):
         num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
                                                        num_heads)
         if "kv_cache_type" in config["build_config"]:
-            kv_cache_type = KVCacheType.from_string(
-                config["build_config"]["kv_cache_type"])
+            kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
         else:
             kv_cache_type = KVCacheType.CONTINUOUS

examples/models/core/qwenvl/run.py

Lines changed: 2 additions & 3 deletions

@@ -25,7 +25,7 @@
 import tensorrt_llm
 import tensorrt_llm.profiler as profiler
 from tensorrt_llm import logger
-from tensorrt_llm.bindings import KVCacheType
+from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
 from tensorrt_llm.quantization import QuantMode
 from tensorrt_llm.runtime import (ModelConfig, SamplingConfig, Session,
                                   TensorInfo)

@@ -118,8 +118,7 @@ def get_model(self):
         num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
                                                        num_heads)
         if "kv_cache_type" in config["build_config"]:
-            kv_cache_type = KVCacheType.from_string(
-                config["build_config"]["kv_cache_type"])
+            kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
         else:
             kv_cache_type = KVCacheType.CONTINUOUS

examples/models/core/whisper/run.py

Lines changed: 2 additions & 1 deletion

@@ -33,7 +33,8 @@
 import tensorrt_llm.logger as logger
 from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
                                  trt_dtype_to_torch)
-from tensorrt_llm.bindings import GptJsonConfig, KVCacheType
+from tensorrt_llm.bindings import GptJsonConfig
+from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
 from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelConfig, SamplingConfig
 from tensorrt_llm.runtime.session import Session, TensorInfo
3940

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 2 additions & 4 deletions

@@ -9,7 +9,6 @@
 from tensorrt_llm.models.modeling_utils import QuantConfig

 from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, KvCacheConfig, _ParallelConfig
-from ...llmapi.utils import get_type_repr
 from .models import ModelFactory, ModelFactoryRegistry
 from .utils._config import DynamicYamlMixInForSettings
 from .utils.logger import ad_logger

@@ -318,12 +317,11 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):

     model_config = _get_config_dict()

-    build_config: Optional[object] = Field(
-        default_factory=lambda: BuildConfig(),
+    build_config: Optional[BuildConfig] = Field(
+        default_factory=BuildConfig,
         description="!!! DO NOT USE !!! Internal only; needed for BaseLlmArgs compatibility.",
         exclude_from_json=True,
         frozen=True,
-        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"},
         repr=False,
     )
     backend: Literal["_autodeploy"] = Field(
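
With BuildConfig itself now a Pydantic model, the field can carry its real type annotation: Pydantic derives validation and the JSON schema from the annotation, which is why the get_type_repr workaround in json_schema_extra can be dropped. A minimal sketch of the typed-field pattern, assuming pydantic v2 (DemoBuildConfig and its defaults are stand-ins, not the real BuildConfig):

from typing import Optional

from pydantic import BaseModel, Field


class DemoBuildConfig(BaseModel):
    max_batch_size: int = 2048
    max_num_tokens: int = 8192


class DemoArgs(BaseModel):
    # The annotation alone gives Pydantic the type for validation and
    # schema generation; default_factory builds a fresh instance per model.
    build_config: Optional[DemoBuildConfig] = Field(
        default_factory=DemoBuildConfig,
        frozen=True,
        repr=False,
    )


print(DemoArgs().build_config.max_batch_size)  # 2048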

tensorrt_llm/bench/build/build.py

Lines changed: 2 additions & 2 deletions

@@ -22,8 +22,8 @@
     QuantAlgo.NVFP4, QuantAlgo.FP8, QuantAlgo.FP8_BLOCK_SCALES,
     QuantAlgo.NO_QUANT, None
 }
-DEFAULT_MAX_BATCH_SIZE = BuildConfig.max_batch_size
-DEFAULT_MAX_NUM_TOKENS = BuildConfig.max_num_tokens
+DEFAULT_MAX_BATCH_SIZE = BuildConfig.model_fields["max_batch_size"].default
+DEFAULT_MAX_NUM_TOKENS = BuildConfig.model_fields["max_num_tokens"].default


 def get_benchmark_engine_settings(
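
This last change follows directly from the Pydantic migration: on a Pydantic v2 model, annotated fields are no longer plain class attributes (class-level access raises AttributeError), so defaults must be read from the model_fields metadata instead. A small stand-in demonstration (the 2048/8192 values are illustrative, not necessarily BuildConfig's real defaults):

from pydantic import BaseModel


class DemoBuildConfig(BaseModel):
    max_batch_size: int = 2048
    max_num_tokens: int = 8192


# Read a default without instantiating the model:
print(DemoBuildConfig.model_fields["max_batch_size"].default)  # 2048

# The old-style class attribute access no longer works:
try:
    DemoBuildConfig.max_batch_size
except AttributeError as err:
    print(f"class attribute access fails: {err}")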
