diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh
index a1f66dc017..9aae2095fa 100644
--- a/benchmarks/llm/perf.sh
+++ b/benchmarks/llm/perf.sh
@@ -208,6 +208,33 @@ echo "ISL: $isl"
 echo "OSL: $osl"
 echo "Concurrency levels: ${concurrency_array[@]}"
+
+# The configuration is dumped to a JSON file which holds details of the OAI service
+# being benchmarked.
+deployment_config=$(cat << EOF
+{
+  "kind": "${deployment_kind}",
+  "model": "${model}",
+  "input_sequence_length": ${isl},
+  "output_sequence_length": ${osl},
+  "tensor_parallelism": ${tp},
+  "data_parallelism": ${dp},
+  "prefill_tensor_parallelism": ${prefill_tp},
+  "prefill_data_parallelism": ${prefill_dp},
+  "decode_tensor_parallelism": ${decode_tp},
+  "decode_data_parallelism": ${decode_dp},
+  "mode": "${mode}"
+}
+EOF
+)
+
+mkdir -p "${artifact_dir}"
+if [ -f "${artifact_dir}/deployment_config.json" ]; then
+  echo "Deployment configuration already exists. Overwriting..."
+  rm -f "${artifact_dir}/deployment_config.json"
+fi
+echo "${deployment_config}" > "${artifact_dir}/deployment_config.json"
+
 # Concurrency levels to test
 for concurrency in "${concurrency_array[@]}"; do
   echo "Run concurrency: $concurrency"
@@ -242,30 +269,4 @@ for concurrency in "${concurrency_array[@]}"; do
 
 done
 
-# The configuration is dumped to a JSON file which hold details of the OAI service
-# being benchmarked.
-deployment_config=$(cat << EOF
-{
-  "kind": "${deployment_kind}",
-  "model": "${model}",
-  "input_sequence_length": ${isl},
-  "output_sequence_length": ${osl},
-  "tensor_parallelism": ${tp},
-  "data_parallelism": ${dp},
-  "prefill_tensor_parallelism": ${prefill_tp},
-  "prefill_data_parallelism": ${prefill_dp},
-  "decode_tensor_parallelism": ${decode_tp},
-  "decode_data_parallelism": ${decode_dp},
-  "mode": "${mode}"
-}
-EOF
-)
-
-mkdir -p "${artifact_dir}"
-if [ -f "${artifact_dir}/deployment_config.json" ]; then
-  echo "Deployment configuration already exists. Overwriting..."
-  rm -f "${artifact_dir}/deployment_config.json"
-fi
-echo "${deployment_config}" > "${artifact_dir}/deployment_config.json"
-
 echo "Benchmarking Successful!!!"
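For reference, a minimal sketch of how the dumped `deployment_config.json` could be inspected after a benchmark run. The `artifact_dir` value and the use of `jq` here are illustrative assumptions, not part of the patch:

```bash
# Hedged sketch: inspect the deployment configuration written by perf.sh.
# "artifact_dir" is a placeholder; point it at the directory perf.sh wrote to.
artifact_dir="./artifacts"

# Pretty-print the whole config, then pull out the parallelism settings.
jq . "${artifact_dir}/deployment_config.json"
jq -r '"TP=\(.tensor_parallelism) DP=\(.data_parallelism) mode=\(.mode)"' \
    "${artifact_dir}/deployment_config.json"
```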
diff --git a/components/backends/vllm/launch/dsr1_dep.sh b/components/backends/vllm/launch/dsr1_dep.sh
index 4d8c303323..4ca2b23776 100755
--- a/components/backends/vllm/launch/dsr1_dep.sh
+++ b/components/backends/vllm/launch/dsr1_dep.sh
@@ -11,7 +11,8 @@ GPUS_PER_NODE=""
 MASTER_ADDR="localhost"
 LOG_DIR="./logs"
 MODEL="deepseek-ai/DeepSeek-R1"
-
+SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
+IS_PREFILL_WORKER="false"
 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -39,6 +40,14 @@ while [[ $# -gt 0 ]]; do
             MODEL="$2"
             shift 2
             ;;
+        --served-model-name)
+            SERVED_MODEL_NAME="$2"
+            shift 2
+            ;;
+        --is-prefill-worker)
+            IS_PREFILL_WORKER=true
+            shift 1
+            ;;
         -h|--help)
             echo "Usage: $0 [OPTIONS]"
             echo "Options:"
@@ -48,6 +57,8 @@ while [[ $# -gt 0 ]]; do
             echo "  --master-addr ADDR    Master node address (default: localhost)"
             echo "  --log-dir DIR         Directory for log files (default: ./logs)"
             echo "  --model MODEL         Model name to use (default: ${MODEL})"
+            echo "  --served-model-name SERVED_MODEL_NAME  Served model name to use (default: ${SERVED_MODEL_NAME})"
+            echo "  --is-prefill-worker   Mark this worker as a prefill worker (flag)"
             echo "  -h, --help            Show this help message"
             exit 0
             ;;
@@ -78,34 +89,96 @@ echo "  Data parallel size: $DATA_PARALLEL_SIZE"
 echo "  Master address: $MASTER_ADDR"
 echo "  Log directory: $LOG_DIR"
 echo "  Model name: $MODEL"
+echo "  Served model name: $SERVED_MODEL_NAME"
+echo "  Is prefill worker: $IS_PREFILL_WORKER"
 
-trap 'echo Cleaning up...; kill 0' EXIT
+cleanup() {
+    echo "Cleaning up..."
+    set +e
+    # Terminate background jobs started by this script
+    jobs -p | xargs -r kill 2>/dev/null || true
+    # Also terminate any direct child processes of this script
+    pkill -P $$ 2>/dev/null || true
+    # Give processes a moment to exit gracefully, then force kill leftovers
+    sleep 1
+    jobs -p | xargs -r kill -9 2>/dev/null || true
+}
 
-# run ingress if it's node 0
-if [ $NODE_RANK -eq 0 ]; then
-    DYN_LOG=debug python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
+trap 'cleanup' EXIT
+trap 'cleanup; exit 130' INT
+trap 'cleanup; exit 143' TERM
+
+# run ingress if it's node 0 and is not a prefill worker
+if [ "$NODE_RANK" -eq 0 ] && [ "$IS_PREFILL_WORKER" = "false" ]; then
+    python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee $LOG_DIR/dsr1_dep_ingress.log &
 fi
 
 mkdir -p $LOG_DIR
+
+export GLOO_SOCKET_IFNAME=eth3  # this has to be a non-IB network interface
+export NCCL_SOCKET_IFNAME=eth3
+export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=eth3
+export NVIDIA_GDRCOPY=enabled
+export NVSHMEM_REMOTE_TRANSPORT=ibgda
+export NVSHMEM_IB_ENABLE_IBGDA="true"
+export VLLM_USE_DEEP_GEMM=1
+export VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1
+export VLLM_SKIP_P2P_CHECK=1
+
 # Data Parallel Attention / Expert Parallelism
 # Routing to DP workers managed by Dynamo
 for ((i=0; i<GPUS_PER_NODE; i++)); do
     dp_rank=$((i + NODE_RANK * GPUS_PER_NODE))
-    ... 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}.log &
+    if [ "$IS_PREFILL_WORKER" = "true" ]; then
+        CUDA_VISIBLE_DEVICES=$i \
+        VLLM_ALL2ALL_BACKEND=deepep_high_throughput \
+        python3 -m dynamo.vllm \
+            --model $MODEL \
+            --served-model-name $SERVED_MODEL_NAME \
+            --tensor-parallel-size 1 \
+            --data_parallel_size $DATA_PARALLEL_SIZE \
+            --data-parallel-address $MASTER_ADDR \
+            --data-parallel-rpc-port 13345 \
+            --data-parallel-rank $dp_rank \
+            --max-model-len 1048 \
+            --gpu-memory-utilization 0.8 \
+            --enable-expert-parallel \
+            --data-parallel-hybrid-lb \
+            --async-scheduling \
+            --enable-dbo \
+            --dbo-decode-token-threshold 32 \
+            --enable-eplb \
+            --eplb-config '{"window_size":"1000",
"step_interval":"3000", + "num_redundant_experts":"32", + "log_balancedness":"False"}' \ + --is-prefill-worker 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}_prefill.log & + else + CUDA_VISIBLE_DEVICES=$i \ + VLLM_ALL2ALL_BACKEND=deepep_low_latency \ + python3 -m dynamo.vllm \ + --model $MODEL \ + --served-model-name $SERVED_MODEL_NAME \ + --tensor-parallel-size 1 \ + --data_parallel_size $DATA_PARALLEL_SIZE \ + --data-parallel-address $MASTER_ADDR \ + --data-parallel-rpc-port 13345 \ + --data-parallel-rank $dp_rank \ + --max-model-len 2560 \ + --gpu-memory-utilization 0.85 \ + --enable-expert-parallel \ + --data-parallel-hybrid-lb \ + --async-scheduling \ + --enable-dbo \ + --dbo-decode-token-threshold 32 \ + --enable-eplb \ + --eplb-config '{"window_size":"1000", + "step_interval":"3000", + "num_redundant_experts":"32", + "log_balancedness":"False"}' \ + --compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}_decode.log & + fi done echo "All workers starting. (press Ctrl+C to stop)..." diff --git a/docs/backends/vllm/deepseek-r1.md b/docs/backends/vllm/deepseek-r1.md index c859695e6f..787c7f23db 100644 --- a/docs/backends/vllm/deepseek-r1.md +++ b/docs/backends/vllm/deepseek-r1.md @@ -21,6 +21,22 @@ node 1 ./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr ``` +### PD Disaggregation + +By default, each worker launched with `dsr1_dep.sh` will act as an **aggregated worker**, handling both prefill and decode requests. If you launch any workers with the `--is-prefill-worker` flag, those workers will act as **prefill workers** only, and the default workers will automatically switch to act as **decode workers**. + +To mark a worker as a prefill worker, simply add the `--is-prefill-worker` flag to the launch command. For example: + +node 2 +```bash +./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr --is-prefill-worker +``` + +node 3 +```bash +./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr --is-prefill-worker +``` + ### Testing the Deployment On node 0 (where the frontend was started) send a test request to verify your deployment: