11 changes: 6 additions & 5 deletions components/src/dynamo/trtllm/main.py
@@ -241,18 +241,19 @@ async def init(runtime: DistributedRuntime, config: Config):

    if config.publish_events_and_metrics:
        # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
        # Convert KvCacheConfig object to dict and add the parameter
        # Add it to kv_cache_config while preserving cache_transceiver_config from YAML
        current_kv_config = arg_map["kv_cache_config"]
        if isinstance(current_kv_config, KvCacheConfig):
            # Convert KvCacheConfig object to dict (no cache_transceiver_config to preserve)
            arg_map["kv_cache_config"] = {
                "free_gpu_memory_fraction": config.free_gpu_memory_fraction,
                "event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
            }
        elif isinstance(current_kv_config, dict):
            if "event_buffer_max_size" not in current_kv_config:
                current_kv_config[
                    "event_buffer_max_size"
                ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
                # Add event_buffer_max_size while preserving cache_transceiver_config and other YAML settings
                current_kv_config[
                    "event_buffer_max_size"
                ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE

# Only pytorch backend is supported for now to publish events and metrics.
if "backend" not in arg_map:
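For illustration, a minimal standalone sketch (not the Dynamo code itself; the starting dict and the buffer size below are made up for this example) of why mutating the YAML-provided dict in place preserves its other keys:

    # Hypothetical kv_cache_config as it might arrive from a recipe YAML.
    DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024  # placeholder value for this sketch

    current_kv_config = {
        "free_gpu_memory_fraction": 0.24,
        # ...any other keys the YAML set stay in this dict...
    }

    # Mirrors the dict branch above: only add the key if the YAML did not set it,
    # so every other YAML-provided setting is left untouched.
    if "event_buffer_max_size" not in current_kv_config:
        current_kv_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE

    print(current_kv_config)
    # {'free_gpu_memory_fraction': 0.24, 'event_buffer_max_size': 1024}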
2 changes: 1 addition & 1 deletion examples/backends/trtllm/launch/disagg.sh
@@ -46,4 +46,4 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
--modality "$MODALITY" \
--disaggregation-mode decode
--disaggregation-mode decode
Contributor comment:
never seen a diff like this before, is this CRLF <> LF change?

79 changes: 79 additions & 0 deletions examples/backends/trtllm/launch/disagg_same_gpu.sh
@@ -0,0 +1,79 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated mode on single GPU - for testing only
# Both prefill and decode workers share the same GPU with reduced memory

# Check GPU memory availability
FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
if [ $? -ne 0 ]; then
    echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
    exit 1
fi

REQUIRED_GB=20
# Use bash arithmetic instead of bc to avoid external dependency
FREE_GPU_INT=$(python3 -c "print(int(float('$FREE_GPU_GB')))" 2>/dev/null)
if [ $? -ne 0 ]; then
    echo "Error: Failed to parse GPU memory value."
    exit 1
fi

if (( FREE_GPU_INT < REQUIRED_GB )); then
    echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB"
    echo "Please free up GPU memory before running disaggregated mode on single GPU."
    exit 1
fi

echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"

# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm-small/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm-small/decode.yaml"}
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
export MODALITY=${MODALITY:-"text"}

# Setup cleanup trap
cleanup() {
    echo "Cleaning up background processes..."
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run prefill worker (shares GPU with decode)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
--disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
--modality "$MODALITY" \
--publish-events-and-metrics \
--disaggregation-mode prefill &
PREFILL_PID=$!

# run decode worker (shares GPU with prefill)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
--modality "$MODALITY" \
--publish-events-and-metrics \
--disaggregation-mode decode
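The memory gate at the top of this script shells out to Python twice; a rough standalone equivalent of what those two python3 -c calls compute (illustrative only, using the same 20 GB threshold):

    import sys

    import torch

    REQUIRED_GB = 20  # same threshold as the script

    # torch.cuda.mem_get_info() returns (free_bytes, total_bytes) for the current device
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    free_gb = free_bytes / 1024**3

    if int(free_gb) < REQUIRED_GB:
        print(f"Error: Insufficient GPU memory. Required: {REQUIRED_GB}GB, Available: {free_gb:.1f}GB")
        sys.exit(1)

    print(f"GPU memory check passed: {free_gb:.1f}GB available (required: {REQUIRED_GB}GB)")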

10 changes: 0 additions & 10 deletions lib/bindings/python/Cargo.lock

Some generated files are not rendered by default.

34 changes: 34 additions & 0 deletions recipes/qwen3/trtllm-small/agg.yaml
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Memory-optimized config for single GPU deployment (FP16)
# This is for testing. Do not use this for production.
# How many models can fit?
# - RTX 4090 (24GB): 10x 0.6B, 6x 1B, 2x 3B
# - RTX 6000 Ada (48GB): 20x 0.6B, 8x 1.5B, 4x 3.5B, 2x 7B
# - A100 (40GB): 16x 0.6B, 4x 2.5B, 2x 6B
# - A100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
# - H100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
#
# For production (85% memory): RTX 6000 can handle 70x 0.6B, 4x 6B, 2x 12B, 1x 25B

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
max_batch_size: 4
Contributor comment on lines +18 to +19:
seems too low for general use cases? but please ignore me if recipe folder doesn't need to be high-performance in general. same for bs and mnt in disagg

trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
  free_gpu_memory_fraction: 0.24
Contributor comment:
seems too small, but also please ignore me if recipe folder doesn't need to be high-performance in general. same for below mem frac in disagg


# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428


cuda_graph_config:
  max_batch_size: 4
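The "how many models can fit" figures in the header comment are consistent with a simple floor division; a back-of-the-envelope sketch, where the ~2.4 GB per 0.6B instance is an assumption inferred from that table rather than a measured number:

    def instances_that_fit(vram_gb: float, per_instance_gb: float) -> int:
        """Floor division: how many identical model instances fit in the given VRAM."""
        return int(vram_gb // per_instance_gb)

    ASSUMED_GB_PER_0_6B = 2.4  # inferred from the table above, not measured

    for name, vram in [("RTX 4090", 24), ("RTX 6000 Ada", 48), ("A100/H100 80GB", 80)]:
        print(name, instances_that_fit(vram, ASSUMED_GB_PER_0_6B))
    # RTX 4090 -> 10, RTX 6000 Ada -> 20, A100/H100 80GB -> 33 (the table above is slightly more conservative at 32)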
27 changes: 27 additions & 0 deletions recipes/qwen3/trtllm-small/decode.yaml
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Decode worker config for disaggregated mode (shares GPU with prefill worker)
# This is for testing. Do not use this for production.

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch
# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true
# Overlap scheduler enabled - decode workers can overlap multiple decode operations
disable_overlap_scheduler: false

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver receives KV cache from prefill worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
cache_transceiver_config:
  backend: DEFAULT
28 changes: 28 additions & 0 deletions recipes/qwen3/trtllm-small/prefill.yaml
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Prefill worker config for disaggregated mode (shares GPU with decode worker)
# This is for testing. Do not use this for production.

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch
# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true
# Disable overlap scheduler - prefill workers only handle context-only requests
# PyTorch backend does not support overlap for context-only requests
disable_overlap_scheduler: true

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver enables KV cache transfer from prefill to decode worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
cache_transceiver_config:
  backend: DEFAULT
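Because the prefill and decode workers launched by disagg_same_gpu.sh share one GPU, each reserving free_gpu_memory_fraction: 0.24 for KV cache on top of its own copy of the model, the small fraction is deliberate. A rough sanity check of the budget (the free-memory and per-worker weight figures are assumptions for illustration, not measurements):

    free_gb = 22.0       # assumed free VRAM before either worker starts
    weights_gb = 1.5     # assumed per-worker footprint for Qwen3-0.6B weights/runtime
    kv_fraction = 0.24   # free_gpu_memory_fraction from these recipes

    for worker in ("prefill", "decode"):
        free_gb -= weights_gb              # each worker loads its own copy of the model
        kv_gb = kv_fraction * free_gb      # KV cache sized as a fraction of what is still free
        free_gb -= kv_gb
        print(f"{worker}: ~{kv_gb:.1f} GB KV cache, ~{free_gb:.1f} GB still free")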
14 changes: 14 additions & 0 deletions tests/serve/test_trtllm.py
@@ -60,6 +60,20 @@ class TRTLLMConfig(EngineConfig):
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": TRTLLMConfig(
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(port=8081, min_num_requests=6, backend="trtllm"),
            metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"),
        ],
    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,
2 changes: 2 additions & 0 deletions tests/utils/payload_builder.py
@@ -67,6 +67,7 @@ def metric_payload_default(
    repeat_count: int = 1,
    expected_log: Optional[List[str]] = None,
    backend: Optional[str] = None,
    port: int = 8081,
) -> MetricsPayload:
    return MetricsPayload(
        body={},
@@ -75,6 +76,7 @@ def metric_payload_default(
        expected_response=[],
        min_num_requests=min_num_requests,
        backend=backend,
        port=port,
    )


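The new port parameter lets the disaggregated_same_gpu test point one metrics probe at each worker's system port (8081 for prefill, 8082 for decode, matching DYN_SYSTEM_PORT in the launch script). As a rough illustration of what such a per-worker probe amounts to (the /metrics path here is an assumption, not taken from the test harness):

    import urllib.request

    # Assumed endpoints: one system port per worker, as set via DYN_SYSTEM_PORT above.
    for worker, port in (("prefill", 8081), ("decode", 8082)):
        with urllib.request.urlopen(f"http://localhost:{port}/metrics", timeout=5) as resp:
            text = resp.read().decode()
        # The test requires at least 6 requests observed per worker; this just shows
        # that each worker exposes its own metrics endpoint separately.
        print(worker, "metrics bytes:", len(text))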