53 changes: 27 additions & 26 deletions benchmarks/llm/perf.sh
@@ -208,6 +208,33 @@ echo "ISL: $isl"
echo "OSL: $osl"
echo "Concurrency levels: ${concurrency_array[@]}"


# The configuration is dumped to a JSON file which holds details of the OAI
# service being benchmarked.
deployment_config=$(cat << EOF
{
"kind": "${deployment_kind}",
"model": "${model}",
"input_sequence_length": ${isl},
"output_sequence_length": ${osl},
"tensor_parallelism": ${tp},
"data_parallelism": ${dp},
"prefill_tensor_parallelism": ${prefill_tp},
"prefill_data_parallelism": ${prefill_dp},
"decode_tensor_parallelism": ${decode_tp},
"decode_data_parallelism": ${decode_dp},
"mode": "${mode}"
}
EOF
)
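# For example, with deployment_kind="agg", model="deepseek-ai/DeepSeek-R1",
# isl=3000, osl=150, tp=8, dp=1 (illustrative values only, not defaults),
# the file written below would begin:
# {
#   "kind": "agg",
#   "model": "deepseek-ai/DeepSeek-R1",
#   "input_sequence_length": 3000,
#   "output_sequence_length": 150,
#   ...
# }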

mkdir -p "${artifact_dir}"
if [ -f "${artifact_dir}/deployment_config.json" ]; then
echo "Deployment configuration already exists. Overwriting..."
rm -f "${artifact_dir}/deployment_config.json"
fi
echo "${deployment_config}" > "${artifact_dir}/deployment_config.json"

# Concurrency levels to test
for concurrency in "${concurrency_array[@]}"; do
echo "Run concurrency: $concurrency"
@@ -242,30 +269,4 @@ for concurrency in "${concurrency_array[@]}"; do

done


echo "Benchmarking Successful!!!"
111 changes: 92 additions & 19 deletions components/backends/vllm/launch/dsr1_dep.sh
@@ -11,7 +11,8 @@ GPUS_PER_NODE=""
MASTER_ADDR="localhost"
LOG_DIR="./logs"
MODEL="deepseek-ai/DeepSeek-R1"

SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
IS_PREFILL_WORKER="false"
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
@@ -39,6 +40,14 @@ while [[ $# -gt 0 ]]; do
MODEL="$2"
shift 2
;;
--served-model-name)
SERVED_MODEL_NAME="$2"
shift 2
;;
--is-prefill-worker)
IS_PREFILL_WORKER="true"
shift 1
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo "Options:"
@@ -48,6 +57,8 @@ while [[ $# -gt 0 ]]; do
echo " --master-addr ADDR Master node address (default: localhost)"
echo " --log-dir DIR Directory for log files (default: ./logs)"
echo " --model MODEL Model name to use (default: ${MODEL})"
echo " --served-model-name SERVED_MODEL_NAME Served model name to use (default: ${SERVED_MODEL_NAME})"
echo " --is-prefill-worker Mark this worker as a prefill worker (flag)"
echo " -h, --help Show this help message"
exit 0
;;
@@ -78,34 +89,96 @@ echo " Data parallel size: $DATA_PARALLEL_SIZE"
echo " Master address: $MASTER_ADDR"
echo " Log directory: $LOG_DIR"
echo " Model name: $MODEL"
echo " Served model name: $SERVED_MODEL_NAME"
echo " Is prefill worker: $IS_PREFILL_WORKER"

cleanup() {
echo "Cleaning up..."
set +e
# Terminate background jobs started by this script
jobs -p | xargs -r kill 2>/dev/null || true
# Also terminate any direct child processes of this script
pkill -P $$ 2>/dev/null || true
# Give processes a moment to exit gracefully, then force kill leftovers
sleep 1
jobs -p | xargs -r kill -9 2>/dev/null || true
}

trap 'cleanup' EXIT
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

mkdir -p "$LOG_DIR"

# Run the ingress (frontend) only on node 0, and only if this node is not a
# dedicated prefill worker.
if [ "$NODE_RANK" -eq 0 ] && [ "$IS_PREFILL_WORKER" = "false" ]; then
python -m dynamo.frontend --router-mode kv --http-port=8000 2>&1 | tee "$LOG_DIR/dsr1_dep_ingress.log" &
fi

# These must point at a non-InfiniBand network interface available on the nodes.
export GLOO_SOCKET_IFNAME=eth3
export NCCL_SOCKET_IFNAME=eth3
export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=eth3
# NVSHMEM: use GDRCopy and InfiniBand GPUDirect Async (IBGDA) for remote transport.
export NVIDIA_GDRCOPY=enabled
export NVSHMEM_REMOTE_TRANSPORT=ibgda
export NVSHMEM_IB_ENABLE_IBGDA="true"
# vLLM: enable DeepGEMM kernels, randomize DP dummy inputs, skip the P2P check.
export VLLM_USE_DEEP_GEMM=1
export VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1
export VLLM_SKIP_P2P_CHECK=1

# Data Parallel Attention / Expert Parallelism
# Routing to DP workers managed by Dynamo
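# Each GPU on this node hosts one DP rank. Of the two worker flavors launched
# below, prefill workers use the DeepEP high-throughput all-to-all backend,
# while decode workers use the low-latency backend plus full CUDA-graph decode;
# both enable expert parallelism, dual-batch overlap (DBO), and expert-parallel
# load balancing (EPLB).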
for ((i=0; i<GPUS_PER_NODE; i++)); do
dp_rank=$((i + NODE_RANK * GPUS_PER_NODE))
if [ "$IS_PREFILL_WORKER" = "true" ]; then
CUDA_VISIBLE_DEVICES=$i \
VLLM_ALL2ALL_BACKEND=deepep_high_throughput \
python3 -m dynamo.vllm \
--model $MODEL \
--served-model-name $SERVED_MODEL_NAME \
--tensor-parallel-size 1 \
--data_parallel_size $DATA_PARALLEL_SIZE \
--data-parallel-address $MASTER_ADDR \
--data-parallel-rpc-port 13345 \
--data-parallel-rank $dp_rank \
--max-model-len 1048 \
--gpu-memory-utilization 0.8 \
--enable-expert-parallel \
--data-parallel-hybrid-lb \
--async-scheduling \
--enable-dbo \
--dbo-decode-token-threshold 32 \
--enable-eplb \
--eplb-config '{"window_size":"1000",
"step_interval":"3000",
"num_redundant_experts":"32",
"log_balancedness":"False"}' \
--is-prefill-worker 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}_prefill.log &
else
CUDA_VISIBLE_DEVICES=$i \
VLLM_ALL2ALL_BACKEND=deepep_low_latency \
python3 -m dynamo.vllm \
--model $MODEL \
--served-model-name $SERVED_MODEL_NAME \
--tensor-parallel-size 1 \
--data_parallel_size $DATA_PARALLEL_SIZE \
--data-parallel-address $MASTER_ADDR \
--data-parallel-rpc-port 13345 \
--data-parallel-rank $dp_rank \
--max-model-len 2560 \
--gpu-memory-utilization 0.85 \
--enable-expert-parallel \
--data-parallel-hybrid-lb \
--async-scheduling \
--enable-dbo \
--dbo-decode-token-threshold 32 \
--enable-eplb \
--eplb-config '{"window_size":"1000",
"step_interval":"3000",
"num_redundant_experts":"32",
"log_balancedness":"False"}' \
--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' 2>&1 | tee $LOG_DIR/dsr1_dep_${dp_rank}_decode.log &
fi
done

echo "All workers starting. (press Ctrl+C to stop)..."
16 changes: 16 additions & 0 deletions docs/backends/vllm/deepseek-r1.md
@@ -21,6 +21,22 @@ node 1
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 0 addr>
```

### PD Disaggregation

By default, each worker launched with `dsr1_dep.sh` acts as an **aggregated worker**, handling both prefill and decode requests. If any workers are launched with the `--is-prefill-worker` flag, those workers handle **prefill only**, and the workers launched without the flag automatically switch to acting as **decode workers**.

To mark a worker as a prefill worker, add the `--is-prefill-worker` flag to its launch command. For example:

node 2
```bash
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
```

node 3
```bash
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
```
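
Putting it together, a fully disaggregated deployment pairs decode workers (launched without the flag) with dedicated prefill workers. A sketch, assuming four nodes (addresses and node counts are illustrative):

```bash
# Decode workers: nodes 0 and 1 (node 0 also runs the frontend)
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <node 0 addr>
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 0 addr>

# Prefill workers: nodes 2 and 3, forming their own data-parallel group
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <node 2 addr> --is-prefill-worker
```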

### Testing the Deployment

On node 0 (where the frontend was started) send a test request to verify your deployment: