53 changes: 27 additions & 26 deletions benchmarks/llm/perf.sh
@@ -208,6 +208,33 @@ echo "ISL: $isl"
echo "OSL: $osl"
echo "Concurrency levels: ${concurrency_array[@]}"


# The configuration is dumped to a JSON file which holds details of the OAI
# service being benchmarked.
deployment_config=$(cat << EOF
{
"kind": "${deployment_kind}",
"model": "${model}",
"input_sequence_length": ${isl},
"output_sequence_length": ${osl},
"tensor_parallelism": ${tp},
"data_parallelism": ${dp},
"prefill_tensor_parallelism": ${prefill_tp},
"prefill_data_parallelism": ${prefill_dp},
"decode_tensor_parallelism": ${decode_tp},
"decode_data_parallelism": ${decode_dp},
"mode": "${mode}"
}
EOF
)
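# For example, with deployment_kind="agg", model="deepseek-ai/DeepSeek-R1",
# isl=3000, osl=150, tp=8, dp=1 (illustrative values only, not defaults),
# the file written below would begin:
# {
#   "kind": "agg",
#   "model": "deepseek-ai/DeepSeek-R1",
#   "input_sequence_length": 3000,
#   "output_sequence_length": 150,
#   ...
# }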

mkdir -p "${artifact_dir}"
if [ -f "${artifact_dir}/deployment_config.json" ]; then
echo "Deployment configuration already exists. Overwriting..."
rm -f "${artifact_dir}/deployment_config.json"
fi
echo "${deployment_config}" > "${artifact_dir}/deployment_config.json"

# Concurrency levels to test
for concurrency in "${concurrency_array[@]}"; do
echo "Run concurrency: $concurrency"
@@ -242,30 +269,4 @@ for concurrency in "${concurrency_array[@]}"; do

done


echo "Benchmarking Successful!!!"
111 changes: 92 additions & 19 deletions components/backends/vllm/launch/dsr1_dep.sh
@@ -11,7 +11,8 @@ GPUS_PER_NODE=""
MASTER_ADDR="localhost"
LOG_DIR="./logs"
MODEL="deepseek-ai/DeepSeek-R1"

SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
IS_PREFILL_WORKER="false"
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
@@ -39,6 +40,14 @@ while [[ $# -gt 0 ]]; do
MODEL="$2"
shift 2
;;
--served-model-name)
SERVED_MODEL_NAME="$2"
shift 2
;;
--is-prefill-worker)
IS_PREFILL_WORKER="true"
shift 1
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
@@ -48,6 +57,8 @@ while [[ $# -gt 0 ]]; do
echo " --master-addr ADDR Master node address (default: localhost)"
echo " --log-dir DIR Directory for log files (default: ./logs)"
echo " --model MODEL Model name to use (default: ${MODEL})"
echo " --served-model-name SERVED_MODEL_NAME Served model name to use (default: ${SERVED_MODEL_NAME})"
echo " --is-prefill-worker Mark this worker as a prefill worker (flag)"
echo " -h, --help Show this help message"
exit 0
;;
@@ -78,34 +89,96 @@ echo " Data parallel size: $DATA_PARALLEL_SIZE"
echo " Master address: $MASTER_ADDR"
echo " Log directory: $LOG_DIR"
echo " Model name: $MODEL"
echo " Served model name: $SERVED_MODEL_NAME"
echo " Is prefill worker: $IS_PREFILL_WORKER"

cleanup() {
echo "Cleaning up..."
set +e
# Terminate background jobs started by this script
jobs -p | xargs -r kill 2>/dev/null || true
# Also terminate any direct child processes of this script
pkill -P $$ 2>/dev/null || true
# Give processes a moment to exit gracefully, then force kill leftovers
sleep 1
jobs -p | xargs -r kill -9 2>/dev/null || true
}

trap 'cleanup' EXIT
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

mkdir -p "$LOG_DIR"

# Run the ingress (frontend) only on node 0, and only if this node is not a
# dedicated prefill worker.
if [ "$NODE_RANK" -eq 0 ] && [ "$IS_PREFILL_WORKER" = "false" ]; then
python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee "$LOG_DIR/dsr1_dep_ingress.log" &
fi

# These must point at a non-InfiniBand network interface available on the nodes.
export GLOO_SOCKET_IFNAME=eth3
export NCCL_SOCKET_IFNAME=eth3
export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=eth3
# NVSHMEM: use GDRCopy and InfiniBand GPUDirect Async (IBGDA) for remote transport.
export NVIDIA_GDRCOPY=enabled
export NVSHMEM_REMOTE_TRANSPORT=ibgda
export NVSHMEM_IB_ENABLE_IBGDA="true"
# vLLM: enable DeepGEMM kernels, randomize DP dummy inputs, skip the P2P check.
export VLLM_USE_DEEP_GEMM=1
export VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1
export VLLM_SKIP_P2P_CHECK=1

# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
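# Each GPU on this node hosts one DP rank. Of the two worker flavors launched
# below, prefill workers use the DeepEP high-throughput all-to-all backend,
# while decode workers use the low-latency backend plus full CUDA-graph decode;
# both enable expert parallelism, dual-batch overlap (DBO), and expert-parallel
# load balancing (EPLB).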
for ((i=0; i<GPUS_PER_NODE; i++)); do
dp_rank=$((i + NODE_RANK * GPUS_PER_NODE))
if [ "$IS_PREFILL_WORKER" = "true" ]; then
CUDA_VISIBLE_DEVICES=$i \
VLLM_ALL2ALL_BACKEND=deepep_high_throughput \
python3 -m dynamo.vllm \
--model $MODEL \
--served-model-name $SERVED_MODEL_NAME \
--tensor-parallel-size 1 \
--data_parallel_size $DATA_PARALLEL_SIZE \
--data-parallel-address $MASTER_ADDR \
--data-parallel-rpc-port 13345 \
--data-parallel-rank $dp_rank \
--max-model-len 1048 \
--gpu-memory-utilization 0.8 \
--enable-expert-parallel \
--data-parallel-hybrid-lb \
--async-scheduling \
--enable-dbo \
--dbo-decode-token-threshold 32 \
--enable-eplb \
--eplb-config '{"window_size":"1000",
"step_interval":"3000",
"num_redundant_experts":"32",
"log_balancedness":"False"}' \
--is-prefill-worker 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}_prefill.log &
else
CUDA_VISIBLE_DEVICES=$i \
VLLM_ALL2ALL_BACKEND=deepep_low_latency \
python3 -m dynamo.vllm \
--model $MODEL \
--served-model-name $SERVED_MODEL_NAME \
--tensor-parallel-size 1 \
--data_parallel_size $DATA_PARALLEL_SIZE \
--data-parallel-address $MASTER_ADDR \
--data-parallel-rpc-port 13345 \
--data-parallel-rank $dp_rank \
--max-model-len 2560 \
--gpu-memory-utilization 0.85 \
--enable-expert-parallel \
--data-parallel-hybrid-lb \
--async-scheduling \
--enable-dbo \
--dbo-decode-token-threshold 32 \
--enable-eplb \
--eplb-config '{"window_size":"1000",
"step_interval":"3000",
"num_redundant_experts":"32",
"log_balancedness":"False"}' \
--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}_decode.log &
fi
done

echo "All workers starting. (press Ctrl+C to stop)..."
16 changes: 16 additions & 0 deletions docs/backends/vllm/deepseek-r1.md
@@ -21,6 +21,22 @@ node 1
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 0 addr>
```

### PD Disaggregation

By default, each worker launched with `dsr1_dep.sh` acts as an **aggregated worker**, handling both prefill and decode requests. If any workers are launched with the `--is-prefill-worker` flag, those workers handle **prefill only**, and the workers launched without the flag automatically switch to acting as **decode workers**.

To mark a worker as a prefill worker, add the `--is-prefill-worker` flag to its launch command. For example:

node 2
```bash
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
```

node 3
```bash
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
```
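
Putting it together, a fully disaggregated deployment pairs decode workers (launched without the flag) with dedicated prefill workers. A sketch, assuming four nodes (addresses and node counts are illustrative):

```bash
# Decode workers: nodes 0 and 1 (node 0 also runs the frontend)
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <node 0 addr>
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 0 addr>

# Prefill workers: nodes 2 and 3, forming their own data-parallel group
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
```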

### Testing the Deployment

On node 0 (where the frontend was started) send a test request to verify your deployment: