fix: add prefill metrics support for TensorRT-LLM disaggregated mode #3983
New file (+79 lines):

```bash
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated mode on single GPU - for testing only
# Both prefill and decode workers share the same GPU with reduced memory

# Check GPU memory availability
FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
if [ $? -ne 0 ]; then
    echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
    exit 1
fi

REQUIRED_GB=20
# Use bash arithmetic instead of bc to avoid an external dependency
FREE_GPU_INT=$(python3 -c "print(int(float('$FREE_GPU_GB')))" 2>/dev/null)
if [ $? -ne 0 ]; then
    echo "Error: Failed to parse GPU memory value."
    exit 1
fi

if (( FREE_GPU_INT < REQUIRED_GB )); then
    echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB"
    echo "Please free up GPU memory before running disaggregated mode on single GPU."
    exit 1
fi

echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"

# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm-small/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm-small/decode.yaml"}
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
export MODALITY=${MODALITY:-"text"}

# Setup cleanup trap
cleanup() {
    echo "Cleaning up background processes..."
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM


# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# run prefill worker (shares GPU with decode)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$PREFILL_ENGINE_ARGS" \
    --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
    --modality "$MODALITY" \
    --publish-events-and-metrics \
    --disaggregation-mode prefill &
PREFILL_PID=$!

# run decode worker (shares GPU with prefill)
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$DECODE_ENGINE_ARGS" \
    --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
    --modality "$MODALITY" \
    --publish-events-and-metrics \
    --disaggregation-mode decode
```
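For a quick smoke test once the script is up, a request like the one below should exercise both workers, assuming the frontend exposes an OpenAI-compatible chat completions route on the configured `--http-port` (8000 here); the prompt and sampling parameters are placeholders:

```bash
# Minimal smoke test against the frontend started by the script above.
# Assumes an OpenAI-compatible /v1/chat/completions route on port 8000;
# adjust the path or port if your frontend is configured differently.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "max_tokens": 16
      }'
```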
New file (+34 lines):

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Memory-optimized config for single GPU deployment (FP16)
# This is for testing. Do not use this for production.
# How many models can fit?
# - RTX 4090 (24GB): 10x 0.6B, 6x 1B, 2x 3B
# - RTX 6000 Ada (48GB): 20x 0.6B, 8x 1.5B, 4x 3.5B, 2x 7B
# - A100 (40GB): 16x 0.6B, 4x 2.5B, 2x 6B
# - A100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
# - H100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
#
# For production (85% memory): RTX 6000 can handle 70x 0.6B, 4x 6B, 2x 12B, 1x 25B

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
max_batch_size: 4
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428


cuda_graph_config:
  max_batch_size: 4
```

Review comment on lines +18 to +19 (`max_num_tokens: 1024`, `max_batch_size: 4`): seems too low for general use cases? But please ignore me if the recipe folder doesn't need to be high-performance in general. Same for bs and mnt in disagg.

Review comment on `free_gpu_memory_fraction: 0.24`: seems too small, but also please ignore me if the recipe folder doesn't need to be high-performance in general. Same for the mem frac in the disagg configs below.
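As a rough sanity check on the capacity estimates in the config header above, the numbers are broadly consistent with budgeting about twice the FP16 weight footprint per model instance (weights plus KV cache and runtime overhead). The sketch below is a back-of-envelope illustration with an assumed 2x overhead factor, not how the limits were actually derived:

```bash
# Illustrative only: FP16 weights take ~2 bytes/param; assume roughly 2x the
# weight size per instance to cover KV cache and runtime overhead.
python3 - <<'EOF'
def max_instances(gpu_gb, params_billion, overhead_factor=2.0):
    weights_gb = params_billion * 2          # 2 bytes per parameter in FP16
    return int(gpu_gb // (weights_gb * overhead_factor))

print(max_instances(24, 0.6))  # -> 10, matching the RTX 4090 (24GB) line above
print(max_instances(48, 0.6))  # -> 20, matching the RTX 6000 Ada (48GB) line
EOF
```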
New file (+27 lines):

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Decode worker config for disaggregated mode (shares GPU with prefill worker)
# This is for testing. Do not use this for production.

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch
# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true
# Overlap scheduler enabled - decode workers can overlap multiple decode operations
disable_overlap_scheduler: false

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver receives KV cache from prefill worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
cache_transceiver_config:
  backend: DEFAULT
```
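Because the prefill and decode workers share one GPU here, each reserves `free_gpu_memory_fraction: 0.24`, keeping the combined KV-cache reservation under roughly half the card and leaving room for two copies of the model weights plus activations. A minimal budget sketch follows; the 24 GB card size, the 0.6B FP16 weight estimate, and the assumption that each worker measures a mostly-free GPU at startup are all illustrative, since the exact accounting depends on startup order:

```bash
# Rough, illustrative memory budget for two workers sharing one 24GB GPU.
python3 - <<'EOF'
total_gb   = 24.0   # assumed card size
weights_gb = 1.2    # ~0.6B params in FP16, per worker (rough)
frac       = 0.24   # free_gpu_memory_fraction in both worker configs

kv_total = 2 * frac * total_gb          # upper bound if both see a mostly-free GPU
used     = 2 * weights_gb + kv_total    # weights for both workers + KV cache
print(f"KV cache (both workers): ~{kv_total:.1f} GB")
print(f"Total budget:            ~{used:.1f} GB of {total_gb:.0f} GB")
EOF
```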
New file (+28 lines):

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Prefill worker config for disaggregated mode (shares GPU with decode worker)
# This is for testing. Do not use this for production.

tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch
# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true
# Disable overlap scheduler - prefill workers only handle context-only requests
# PyTorch backend does not support overlap for context-only requests
disable_overlap_scheduler: true

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver enables KV cache transfer from prefill to decode worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
cache_transceiver_config:
  backend: DEFAULT
```
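Since this PR is about prefill metrics, one quick verification is to poll the prefill worker's system port (8081 in the launch script, via `DYN_SYSTEM_PORT`) after sending a request through the frontend. The `/metrics` path and the `prefill` substring below are assumptions about what the system status server exposes, not confirmed endpoint or metric names:

```bash
# Check that the prefill worker (DYN_SYSTEM_PORT=8081 in the script above)
# is publishing metrics. The endpoint path and metric names are assumptions;
# adjust to whatever the system status server actually serves.
curl -s http://localhost:8081/metrics | grep -i prefill || \
    echo "No prefill metrics found (endpoint or metric names may differ)"
```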
Review comment: never seen a diff like this before; is this a CRLF <> LF change?