2 changes: 1 addition & 1 deletion docs/backends/trtllm/README.md
@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
```bash
cd $DYNAMO_HOME/examples/backends/trtllm

-export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
+export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
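Note: the path changes in this PR reflect a split of the DeepSeek-R1 TRT-LLM recipes into separate `agg/` and `disagg/` subtrees. A sketch of the assumed layout after the move, reconstructed only from paths that appear in this diff:

```
recipes/deepseek-r1/trtllm/
├── agg/
│   ├── mtp/mtp_agg.yaml
│   └── wide_ep/
│       ├── wide_ep_agg.yaml
│       └── eplb.yaml
└── disagg/
    └── wide_ep/
        ├── wide_ep_prefill.yaml
        ├── wide_ep_decode.yaml
        └── eplb.yaml
```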
6 changes: 3 additions & 3 deletions docs/backends/trtllm/multinode/multinode-examples.md
@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:

```bash
# Default set in srun_aggregated.sh, but can customize here.
-# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
+# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"

# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
@@ -165,8 +165,8 @@ deployment across 8 nodes:

```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"

# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
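To make the override concrete, here is a hedged launch sketch for the 4-node aggregated deployment using the relocated recipe. It assumes you run from a Slurm login node in the repo's multinode example directory; every variable falls back to the defaults quoted above when unset, and 4 nodes × 4 GPUs supplies the 16 GPUs implied by the recipe's TP/EP size of 16.

```bash
# Sketch, assuming the container mounts the repo at /mnt as in srun_aggregated.sh defaults.
export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
export NUM_NODES=4           # NUM_NODES * NUM_GPUS_PER_NODE must equal the
export NUM_GPUS_PER_NODE=4   # total GPUs required by ENGINE_CONFIG (16 here).
./srun_aggregated.sh
```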
2 changes: 1 addition & 1 deletion examples/basics/multinode/trtllm/srun_aggregated.sh
@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_NODES=${NUM_NODES:-4}
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

-export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
+export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}"

# Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well.
4 changes: 2 additions & 2 deletions examples/basics/multinode/trtllm/srun_disaggregated.sh
@@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
-PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
+PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}"

NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
-DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
+DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}"

DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}

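A matching sketch for the disaggregated path, with prefill and decode each sized to their own engine config; as above, all variables fall back to the script defaults when unset, and the invocation from a Slurm login node is assumed.

```bash
# Sketch: 4 prefill nodes + 4 decode nodes, one worker each, default decode_first strategy.
export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"
export NUM_PREFILL_NODES=4
export NUM_DECODE_NODES=4
./srun_disaggregated.sh
```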
4 changes: 2 additions & 2 deletions recipes/deepseek-r1/model-cache/model-cache.yaml
@@ -3,11 +3,11 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: model-cache
+  name: model-cache-pvc
 spec:
   accessModes:
   - ReadWriteMany
   resources:
     requests:
-      storage: 1000Gi
+      storage: 1500Gi
   storageClassName: "your-storage-class-name"
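The bump from 1000Gi to 1500Gi tracks the download job below, which now stores two full checkpoints (DeepSeek-R1 and its FP4 variant) on the same volume. A minimal sketch for creating and checking the claim; the storage class name is still a placeholder you must substitute:

```bash
kubectl apply -f recipes/deepseek-r1/model-cache/model-cache.yaml
kubectl get pvc model-cache-pvc   # expect STATUS=Bound and CAPACITY >= 1500Gi
```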
17 changes: 5 additions & 12 deletions recipes/deepseek-r1/model-cache/model-download.yaml
@@ -14,31 +14,24 @@ spec:
         app: model-download
     spec:
       restartPolicy: Never
-      tolerations: []
       containers:
       - name: model-download
         image: python:3.10-slim
         command: ["sh", "-c"]
         envFrom:
         - secretRef:
             name: hf-token-secret
         env:
-        - name: MODEL_NAME
-          value: deepseek-ai/DeepSeek-R1
-        - name: HF_HOME
-          value: /model-store
         - name: HF_HUB_ENABLE_HF_TRANSFER
           value: "1"
-        - name: MODEL_REVISION
-          value: 56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad
         args:
         - |
           set -eux
           pip install --no-cache-dir huggingface_hub hf_transfer
-          hf download $MODEL_NAME --revision $MODEL_REVISION
+          hf download nvidia/DeepSeek-R1-FP4 --local-dir /model-cache/deepseek-r1-fp4
+          hf download deepseek-ai/DeepSeek-R1 --local-dir /model-cache/deepseek-r1
         volumeMounts:
         - name: model-cache
-          mountPath: /model-store
+          mountPath: /model-cache
       volumes:
       - name: model-cache
         persistentVolumeClaim:
-          claimName: model-cache
+          claimName: model-cache-pvc
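A hedged sketch for running the download job and watching it; the label selector comes from the pod template above, and the target paths match the new `--local-dir` arguments:

```bash
kubectl apply -f recipes/deepseek-r1/model-cache/model-download.yaml
kubectl logs -l app=model-download -f
# On completion the PVC holds /model-cache/deepseek-r1 and /model-cache/deepseek-r1-fp4,
# ready to be mounted by the deploy manifests below.
```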
14 changes: 7 additions & 7 deletions recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
   name: sgl-dsr1-16gpu
 spec:
   pvcs:
-  - name: model-cache
+  - name: model-cache-pvc
     create: false
   services:
     Frontend:
@@ -34,8 +34,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -55,7 +55,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -87,8 +87,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -108,7 +108,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
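Because `--model-path` now points at the pre-downloaded `/model-cache/deepseek-r1` while `--served-model-name` stays `deepseek-ai/DeepSeek-R1`, clients keep addressing the model by its Hugging Face name. A hedged smoke test, assuming the Frontend is reachable on localhost:8000 (e.g. via `kubectl port-forward`) and serves an OpenAI-compatible API:

```bash
curl -s localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "deepseek-ai/DeepSeek-R1",
       "messages": [{"role": "user", "content": "Hello"}],
       "max_tokens": 32}'
```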
14 changes: 7 additions & 7 deletions recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
   name: sgl-dsr1-8gpu
 spec:
   pvcs:
-  - name: model-cache
+  - name: model-cache-pvc
     create: false
   services:
     Frontend:
@@ -32,8 +32,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -53,7 +53,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -81,8 +81,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -102,7 +102,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -11,7 +11,7 @@ moe_config:
 #   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
 #   4096 = 256 * 16
 # moe_max_num_tokens: 4096
-  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
 
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16
7 changes: 7 additions & 0 deletions recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# moe_load_balancer settings for TRTLLM based on:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
+num_slots: 288
+layer_updates_per_iter: 2
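For context on the numbers (an inference, not stated in this diff): assuming DeepSeek-R1's 256 routed experts and the `moe_expert_parallel_size: 16` used by these wide-EP recipes, `num_slots: 288` leaves 32 redundant expert slots for rebalancing and divides evenly across ranks (288 / 16 = 18 slots per GPU), while `layer_updates_per_iter: 2` bounds how many MoE layers the online balancer re-shuffles per iteration.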