fix: add prefill metrics support for TensorRT-LLM disaggregated mode #3983
New file (+79 lines):

```bash
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated mode on single GPU - for testing only
# Both prefill and decode workers share the same GPU with reduced memory

# Check GPU memory availability
FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
if [ $? -ne 0 ]; then
    echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
    exit 1
fi

REQUIRED_GB=20
# Use bash arithmetic instead of bc to avoid an external dependency
FREE_GPU_INT=$(python3 -c "print(int(float('$FREE_GPU_GB')))" 2>/dev/null)
if [ $? -ne 0 ]; then
    echo "Error: Failed to parse GPU memory value."
    exit 1
fi

if (( FREE_GPU_INT < REQUIRED_GB )); then
    echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB"
    echo "Please free up GPU memory before running disaggregated mode on single GPU."
    exit 1
fi

echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"

# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm-small/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm-small/decode.yaml"}
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
export MODALITY=${MODALITY:-"text"}

# Setup cleanup trap
cleanup() {
    echo "Cleaning up background processes..."
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run prefill worker (shares GPU with decode)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$PREFILL_ENGINE_ARGS" \
    --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
    --modality "$MODALITY" \
    --publish-events-and-metrics \
    --disaggregation-mode prefill &
PREFILL_PID=$!

# run decode worker (shares GPU with prefill)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$DECODE_ENGINE_ARGS" \
    --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
    --modality "$MODALITY" \
    --publish-events-and-metrics \
    --disaggregation-mode decode
```
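For a quick smoke test once the script is up, a request like the one below should exercise both workers, assuming the frontend exposes an OpenAI-compatible chat completions route on the configured `--http-port` (8000 here); the prompt and sampling parameters are placeholders:

```bash
# Minimal smoke test against the frontend started by the script above.
# Assumes an OpenAI-compatible /v1/chat/completions route on port 8000;
# adjust the path or port if your frontend is configured differently.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "max_tokens": 16
      }'
```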
New file (+34 lines):

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Memory-optimized config for single GPU deployment (FP16)
# This is for testing. Do not use this for production.
# How many models can fit?
# - RTX 4090 (24GB): 10x 0.6B, 6x 1B, 2x 3B
# - RTX 6000 Ada (48GB): 20x 0.6B, 8x 1.5B, 4x 3.5B, 2x 7B
# - A100 (40GB): 16x 0.6B, 4x 2.5B, 2x 6B
# - A100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
# - H100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
#
# For production (85% memory): RTX 6000 can handle 70x 0.6B, 4x 6B, 2x 12B, 1x 25B

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
max_batch_size: 4
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428


cuda_graph_config:
  max_batch_size: 4
```

Review comment on lines +18 to +19 (`max_num_tokens: 1024`, `max_batch_size: 4`): seems too low for general use cases? But please ignore me if the recipe folder doesn't need to be high-performance in general. Same for bs and mnt in disagg.

Review comment on `free_gpu_memory_fraction: 0.24`: seems too small, but also please ignore me if the recipe folder doesn't need to be high-performance in general. Same for the mem frac in the disagg configs below.
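As a rough sanity check on the capacity estimates in the config header above, the numbers are broadly consistent with budgeting about twice the FP16 weight footprint per model instance (weights plus KV cache and runtime overhead). The sketch below is a back-of-envelope illustration with an assumed 2x overhead factor, not how the limits were actually derived:

```bash
# Illustrative only: FP16 weights take ~2 bytes/param; assume roughly 2x the
# weight size per instance to cover KV cache and runtime overhead.
python3 - <<'EOF'
def max_instances(gpu_gb, params_billion, overhead_factor=2.0):
    weights_gb = params_billion * 2          # 2 bytes per parameter in FP16
    return int(gpu_gb // (weights_gb * overhead_factor))

print(max_instances(24, 0.6))  # -> 10, matching the RTX 4090 (24GB) line above
print(max_instances(48, 0.6))  # -> 20, matching the RTX 6000 Ada (48GB) line
EOF
```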
New file (+27 lines):

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Decode worker config for disaggregated mode (shares GPU with prefill worker)
# This is for testing. Do not use this for production.

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch
# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true
# Overlap scheduler enabled - decode workers can overlap multiple decode operations
disable_overlap_scheduler: false

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver receives KV cache from prefill worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
cache_transceiver_config:
  backend: DEFAULT
```
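Because the prefill and decode workers share one GPU here, each reserves `free_gpu_memory_fraction: 0.24`, keeping the combined KV-cache reservation under roughly half the card and leaving room for two copies of the model weights plus activations. A minimal budget sketch follows; the 24 GB card size, the 0.6B FP16 weight estimate, and the assumption that each worker measures a mostly-free GPU at startup are all illustrative, since the exact accounting depends on startup order:

```bash
# Rough, illustrative memory budget for two workers sharing one 24GB GPU.
python3 - <<'EOF'
total_gb   = 24.0   # assumed card size
weights_gb = 1.2    # ~0.6B params in FP16, per worker (rough)
frac       = 0.24   # free_gpu_memory_fraction in both worker configs

kv_total = 2 * frac * total_gb          # upper bound if both see a mostly-free GPU
used     = 2 * weights_gb + kv_total    # weights for both workers + KV cache
print(f"KV cache (both workers): ~{kv_total:.1f} GB")
print(f"Total budget:            ~{used:.1f} GB of {total_gb:.0f} GB")
EOF
```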
New file (+28 lines):

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Prefill worker config for disaggregated mode (shares GPU with decode worker)
# This is for testing. Do not use this for production.

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch
# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true
# Disable overlap scheduler - prefill workers only handle context-only requests
# PyTorch backend does not support overlap for context-only requests
disable_overlap_scheduler: true

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver enables KV cache transfer from prefill to decode worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
cache_transceiver_config:
  backend: DEFAULT
```
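Since this PR is about prefill metrics, one quick verification is to poll the prefill worker's system port (8081 in the launch script, via `DYN_SYSTEM_PORT`) after sending a request through the frontend. The `/metrics` path and the `prefill` substring below are assumptions about what the system status server exposes, not confirmed endpoint or metric names:

```bash
# Check that the prefill worker (DYN_SYSTEM_PORT=8081 in the script above)
# is publishing metrics. The endpoint path and metric names are assumptions;
# adjust to whatever the system status server actually serves.
curl -s http://localhost:8081/metrics | grep -i prefill || \
    echo "No prefill metrics found (endpoint or metric names may differ)"
```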
Review comment: never seen a diff like this before; is this a CRLF <> LF change?