2 changes: 1 addition & 1 deletion docs/backends/trtllm/README.md
@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
```bash
cd $DYNAMO_HOME/examples/backends/trtllm

-export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
+export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
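Note: the path changes in this PR reflect a split of the DeepSeek-R1 TRT-LLM recipes into separate `agg/` and `disagg/` subtrees. A sketch of the assumed layout after the move, reconstructed only from paths that appear in this diff:

```
recipes/deepseek-r1/trtllm/
├── agg/
│   ├── mtp/mtp_agg.yaml
│   └── wide_ep/
│       ├── wide_ep_agg.yaml
│       └── eplb.yaml
└── disagg/
    └── wide_ep/
        ├── wide_ep_prefill.yaml
        ├── wide_ep_decode.yaml
        └── eplb.yaml
```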
6 changes: 3 additions & 3 deletions docs/backends/trtllm/multinode/multinode-examples.md
@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:

```bash
# Default set in srun_aggregated.sh, but can customize here.
-# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
+# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"

# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
@@ -165,8 +165,8 @@ deployment across 8 nodes:

```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"

# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
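To make the override concrete, here is a hedged launch sketch for the 4-node aggregated deployment using the relocated recipe. It assumes you run from a Slurm login node in the repo's multinode example directory; every variable falls back to the defaults quoted above when unset, and 4 nodes × 4 GPUs supplies the 16 GPUs implied by the recipe's TP/EP size of 16.

```bash
# Sketch, assuming the container mounts the repo at /mnt as in srun_aggregated.sh defaults.
export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
export NUM_NODES=4           # NUM_NODES * NUM_GPUS_PER_NODE must equal the
export NUM_GPUS_PER_NODE=4   # total GPUs required by ENGINE_CONFIG (16 here).
./srun_aggregated.sh
```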
2 changes: 1 addition & 1 deletion examples/basics/multinode/trtllm/srun_aggregated.sh
@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
NUM_NODES=${NUM_NODES:-4}
NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

-export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
+export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}"

# Automate settings of certain variables for convenience, but you are free
# to manually set these for more control as well.
4 changes: 2 additions & 2 deletions examples/basics/multinode/trtllm/srun_disaggregated.sh
@@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}

NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
-PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
+PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}"

NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
-DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
+DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}"

DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}

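A matching sketch for the disaggregated path, with prefill and decode each sized to their own engine config; as above, all variables fall back to the script defaults when unset, and the invocation from a Slurm login node is assumed.

```bash
# Sketch: 4 prefill nodes + 4 decode nodes, one worker each, default decode_first strategy.
export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"
export NUM_PREFILL_NODES=4
export NUM_DECODE_NODES=4
./srun_disaggregated.sh
```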
4 changes: 2 additions & 2 deletions recipes/deepseek-r1/model-cache/model-cache.yaml
@@ -3,11 +3,11 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: model-cache
+  name: model-cache-pvc
 spec:
   accessModes:
   - ReadWriteMany
   resources:
     requests:
-      storage: 1000Gi
+      storage: 1500Gi
   storageClassName: "your-storage-class-name"
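The bump from 1000Gi to 1500Gi tracks the download job below, which now stores two full checkpoints (DeepSeek-R1 and its FP4 variant) on the same volume. A minimal sketch for creating and checking the claim; the storage class name is still a placeholder you must substitute:

```bash
kubectl apply -f recipes/deepseek-r1/model-cache/model-cache.yaml
kubectl get pvc model-cache-pvc   # expect STATUS=Bound and CAPACITY >= 1500Gi
```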
17 changes: 5 additions & 12 deletions recipes/deepseek-r1/model-cache/model-download.yaml
@@ -14,31 +14,24 @@ spec:
         app: model-download
     spec:
       restartPolicy: Never
-      tolerations: []
       containers:
       - name: model-download
         image: python:3.10-slim
         command: ["sh", "-c"]
         envFrom:
         - secretRef:
             name: hf-token-secret
         env:
-        - name: MODEL_NAME
-          value: deepseek-ai/DeepSeek-R1
-        - name: HF_HOME
-          value: /model-store
         - name: HF_HUB_ENABLE_HF_TRANSFER
           value: "1"
-        - name: MODEL_REVISION
-          value: 56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad
         args:
         - |
           set -eux
           pip install --no-cache-dir huggingface_hub hf_transfer
-          hf download $MODEL_NAME --revision $MODEL_REVISION
+          hf download nvidia/DeepSeek-R1-FP4 --local-dir /model-cache/deepseek-r1-fp4
+          hf download deepseek-ai/DeepSeek-R1 --local-dir /model-cache/deepseek-r1
         volumeMounts:
         - name: model-cache
-          mountPath: /model-store
+          mountPath: /model-cache
       volumes:
       - name: model-cache
         persistentVolumeClaim:
-          claimName: model-cache
+          claimName: model-cache-pvc
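A hedged sketch for running the download job and watching it; the label selector comes from the pod template above, and the target paths match the new `--local-dir` arguments:

```bash
kubectl apply -f recipes/deepseek-r1/model-cache/model-download.yaml
kubectl logs -l app=model-download -f
# On completion the PVC holds /model-cache/deepseek-r1 and /model-cache/deepseek-r1-fp4,
# ready to be mounted by the deploy manifests below.
```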
14 changes: 7 additions & 7 deletions recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
   name: sgl-dsr1-16gpu
 spec:
   pvcs:
-  - name: model-cache
+  - name: model-cache-pvc
     create: false
   services:
     Frontend:
@@ -34,8 +34,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -55,7 +55,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -87,8 +87,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -108,7 +108,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
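Because `--model-path` now points at the pre-downloaded `/model-cache/deepseek-r1` while `--served-model-name` stays `deepseek-ai/DeepSeek-R1`, clients keep addressing the model by its Hugging Face name. A hedged smoke test, assuming the Frontend is reachable on localhost:8000 (e.g. via `kubectl port-forward`) and serves an OpenAI-compatible API:

```bash
curl -s localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "deepseek-ai/DeepSeek-R1",
       "messages": [{"role": "user", "content": "Hello"}],
       "max_tokens": 32}'
```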
14 changes: 7 additions & 7 deletions recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
   name: sgl-dsr1-8gpu
 spec:
   pvcs:
-  - name: model-cache
+  - name: model-cache-pvc
     create: false
   services:
     Frontend:
@@ -32,8 +32,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -53,7 +53,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -81,8 +81,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-      - name: model-cache
-        mountPoint: /root/.cache/huggingface
+      - name: model-cache-pvc
+        mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -102,7 +102,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -11,7 +11,7 @@ moe_config:
 #   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
 #   4096 = 256 * 16
 # moe_max_num_tokens: 4096
-  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
 
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16
7 changes: 7 additions & 0 deletions recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# moe_load_balancer settings for TRTLLM based on:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
+num_slots: 288
+layer_updates_per_iter: 2
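For context on the numbers (an inference, not stated in this diff): assuming DeepSeek-R1's 256 routed experts and the `moe_expert_parallel_size: 16` used by these wide-EP recipes, `num_slots: 288` leaves 32 redundant expert slots for rebalancing and divides evenly across ranks (288 / 16 = 18 slots per GPU), while `layer_updates_per_iter: 2` bounds how many MoE layers the online balancer re-shuffles per iteration.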