diff --git a/docs/backends/trtllm/README.md b/docs/backends/trtllm/README.md
index dc4f88e3ad..670ec67669 100644
--- a/docs/backends/trtllm/README.md
+++ b/docs/backends/trtllm/README.md
@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
 
 ```bash
 cd $DYNAMO_HOME/examples/backends/trtllm
-export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
+export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
 export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
 # nvidia/DeepSeek-R1-FP4 is a large model
 export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
diff --git a/docs/backends/trtllm/multinode/multinode-examples.md b/docs/backends/trtllm/multinode/multinode-examples.md
index b8080d504c..c7b18594bc 100644
--- a/docs/backends/trtllm/multinode/multinode-examples.md
+++ b/docs/backends/trtllm/multinode/multinode-examples.md
@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:
 
 ```bash
 # Default set in srun_aggregated.sh, but can customize here.
-# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
+# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
 
 # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
 # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
@@ -165,8 +165,8 @@ deployment across 8 nodes:
 
 ```bash
 # Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"
 
 # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
 # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
diff --git a/examples/basics/multinode/trtllm/srun_aggregated.sh b/examples/basics/multinode/trtllm/srun_aggregated.sh
index 36dcd03099..654c8ef691 100755
--- a/examples/basics/multinode/trtllm/srun_aggregated.sh
+++ b/examples/basics/multinode/trtllm/srun_aggregated.sh
@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 
 NUM_NODES=${NUM_NODES:-4}
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
-export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
+export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}"
 
 # Automate settings of certain variables for convenience, but you are free
 # to manually set these for more control as well.
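The recipe configs referenced above now live under `agg/` and `disagg/` subdirectories. A minimal launch sketch against the relocated aggregated config (the node counts shown are just the script defaults; adjust them for your cluster):

```bash
# Sketch: launch the aggregated wide-EP deployment using the new agg/ path.
cd examples/basics/multinode/trtllm
export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
export NUM_NODES=4           # NUM_NODES * NUM_GPUS_PER_NODE must match the
export NUM_GPUS_PER_NODE=4   # parallelism in ENGINE_CONFIG (16 GPUs for TP16/EP16)
./srun_aggregated.sh
```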
diff --git a/examples/basics/multinode/trtllm/srun_disaggregated.sh b/examples/basics/multinode/trtllm/srun_disaggregated.sh
index fe103db1d4..06fd74677f 100755
--- a/examples/basics/multinode/trtllm/srun_disaggregated.sh
+++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh
@@ -17,11 +17,11 @@
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
 
 NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
 NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
-PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
+PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}"
 
 NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
 NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
-DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
+DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}"
 
 DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
diff --git a/recipes/deepseek-r1/model-cache/model-cache.yaml b/recipes/deepseek-r1/model-cache/model-cache.yaml
index 5f4666fa6f..2e9cd898bf 100644
--- a/recipes/deepseek-r1/model-cache/model-cache.yaml
+++ b/recipes/deepseek-r1/model-cache/model-cache.yaml
@@ -3,11 +3,11 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: model-cache
+  name: model-cache-pvc
 spec:
   accessModes:
     - ReadWriteMany
   resources:
     requests:
-      storage: 1000Gi
+      storage: 1500Gi
   storageClassName: "your-storage-class-name"
\ No newline at end of file
diff --git a/recipes/deepseek-r1/model-cache/model-download.yaml b/recipes/deepseek-r1/model-cache/model-download.yaml
index 0f65b6b58d..ef3e2fd8cb 100644
--- a/recipes/deepseek-r1/model-cache/model-download.yaml
+++ b/recipes/deepseek-r1/model-cache/model-download.yaml
@@ -14,31 +14,24 @@ spec:
         app: model-download
     spec:
       restartPolicy: Never
+      tolerations: []
       containers:
       - name: model-download
         image: python:3.10-slim
         command: ["sh", "-c"]
-        envFrom:
-        - secretRef:
-            name: hf-token-secret
         env:
-        - name: MODEL_NAME
-          value: deepseek-ai/DeepSeek-R1
-        - name: HF_HOME
-          value: /model-store
         - name: HF_HUB_ENABLE_HF_TRANSFER
           value: "1"
-        - name: MODEL_REVISION
-          value: 56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad
         args:
         - |
          set -eux
          pip install --no-cache-dir huggingface_hub hf_transfer
-          hf download $MODEL_NAME --revision $MODEL_REVISION
+          hf download nvidia/DeepSeek-R1-FP4 --local-dir /model-cache/deepseek-r1-fp4
+          hf download deepseek-ai/DeepSeek-R1 --local-dir /model-cache/deepseek-r1
         volumeMounts:
         - name: model-cache
-          mountPath: /model-store
+          mountPath: /model-cache
       volumes:
       - name: model-cache
         persistentVolumeClaim:
-          claimName: model-cache
\ No newline at end of file
+          claimName: model-cache-pvc
\ No newline at end of file
diff --git a/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml b/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
index 455f7943da..437cfdb768 100644
--- a/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
+++ b/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
   name: sgl-dsr1-16gpu
 spec:
   pvcs:
-    - name: model-cache
+    - name: model-cache-pvc
       create: false
   services:
     Frontend:
@@ -34,8 +34,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -55,7 +55,7 @@ spec:
           - dynamo.sglang
           args:
            - --model-path
-           - deepseek-ai/DeepSeek-R1
+           - /model-cache/deepseek-r1
           - --served-model-name
           - deepseek-ai/DeepSeek-R1
           - --tp
@@ -87,8 +87,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -108,7 +108,7 @@ spec:
           - dynamo.sglang
           args:
           - --model-path
-          - deepseek-ai/DeepSeek-R1
+          - /model-cache/deepseek-r1
          - --served-model-name
          - deepseek-ai/DeepSeek-R1
          - --tp
diff --git a/recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml b/recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
index cb156c4425..a53e25235d 100644
--- a/recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
+++ b/recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml
@@ -7,7 +7,7 @@ metadata:
   name: sgl-dsr1-8gpu
 spec:
   pvcs:
-    - name: model-cache
+    - name: model-cache-pvc
       create: false
   services:
     Frontend:
@@ -32,8 +32,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
      extraPodSpec:
@@ -53,7 +53,7 @@ spec:
          - dynamo.sglang
          args:
          - --model-path
-         - deepseek-ai/DeepSeek-R1
+         - /model-cache/deepseek-r1
          - --served-model-name
          - deepseek-ai/DeepSeek-R1
          - --tp
@@ -81,8 +81,8 @@ spec:
         limits:
           gpu: "8"
       volumeMounts:
-        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+        - name: model-cache-pvc
+          mountPoint: /model-cache
       sharedMemory:
         size: 80Gi
       extraPodSpec:
@@ -102,7 +102,7 @@ spec:
          - dynamo.sglang
          args:
          - --model-path
-         - deepseek-ai/DeepSeek-R1
+         - /model-cache/deepseek-r1
          - --served-model-name
          - deepseek-ai/DeepSeek-R1
          - --tp
diff --git a/recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml b/recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
rename to recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
diff --git a/recipes/deepseek-r1/trtllm/simple/agg.yaml b/recipes/deepseek-r1/trtllm/agg/simple/agg.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/simple/agg.yaml
rename to recipes/deepseek-r1/trtllm/agg/simple/agg.yaml
diff --git a/recipes/deepseek-r1/trtllm/wide_ep/dep16_agg.yaml b/recipes/deepseek-r1/trtllm/agg/wide_ep/dep16_agg.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/wide_ep/dep16_agg.yaml
rename to recipes/deepseek-r1/trtllm/agg/wide_ep/dep16_agg.yaml
diff --git a/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml b/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
rename to recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
diff --git a/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml b/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml
similarity index 91%
rename from recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
rename to recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml
index 31d7e395bd..bcd6ae87e0 100644
--- a/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
+++ b/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml
@@ -11,7 +11,7 @@ moe_config:
   # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
   # 4096 = 256 * 16
   # moe_max_num_tokens: 4096
-  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
 
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16
diff --git a/recipes/deepseek-r1/trtllm/mtp/mtp_decode.yaml b/recipes/deepseek-r1/trtllm/disagg/mtp/mtp_decode.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/mtp/mtp_decode.yaml
rename to recipes/deepseek-r1/trtllm/disagg/mtp/mtp_decode.yaml
diff --git a/recipes/deepseek-r1/trtllm/mtp/mtp_prefill.yaml b/recipes/deepseek-r1/trtllm/disagg/mtp/mtp_prefill.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/mtp/mtp_prefill.yaml
rename to recipes/deepseek-r1/trtllm/disagg/mtp/mtp_prefill.yaml
diff --git a/recipes/deepseek-r1/trtllm/simple/decode.yaml b/recipes/deepseek-r1/trtllm/disagg/simple/decode.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/simple/decode.yaml
rename to recipes/deepseek-r1/trtllm/disagg/simple/decode.yaml
diff --git a/recipes/deepseek-r1/trtllm/simple/prefill.yaml b/recipes/deepseek-r1/trtllm/disagg/simple/prefill.yaml
similarity index 100%
rename from recipes/deepseek-r1/trtllm/simple/prefill.yaml
rename to recipes/deepseek-r1/trtllm/disagg/simple/prefill.yaml
diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml b/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
new file mode 100644
index 0000000000..f2fe0a13c6
--- /dev/null
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# moe_load_balancer settings for TRTLLM based on:
+# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
+num_slots: 288
+layer_updates_per_iter: 2
diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
new file mode 100644
index 0000000000..936bcb5bee
--- /dev/null
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
@@ -0,0 +1,253 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Instructions:
+# 1. First, create the model cache PersistentVolumeClaim:
+#    kubectl apply -f model-cache.yaml -n <namespace>
+# 2. Download the model to the model cache:
+#    kubectl apply -f model-download.yaml -n <namespace>
+# 3. Once the above steps are complete, deploy the prefill and decode workers via this yaml:
+#    kubectl apply -f deploy.yaml -n <namespace>
+# 4. To benchmark the service, run:
+#    kubectl apply -f perf.yaml -n <namespace>
+
+# ConfigMap for prefill engine configuration
+# This configuration sets up a DEP 4 prefill worker
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prefill-config
+data:
+  prefill_config.yaml: |
+    build_config:
+      max_batch_size: 4
+      max_num_tokens: 4608
+      max_seq_len: 1227
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    cuda_graph_config: null
+    print_iter_log: true
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: DEFAULT
+---
+
+# ConfigMap for decode engine configuration
+# This configuration sets up a DEP 32 decode worker
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: decode-config
+data:
+  decode_config_dep32.yaml: |
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    build_config:
+      max_batch_size: 32
+      max_num_tokens: 32
+      max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 32
+        - 64
+        - 128
+        - 256
+        - 384
+        - 512
+        - 768
+        - 1024
+        - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: DEFAULT
+    stream_interval: 20
+---
+
+# NOTE: The numNodes value should equal the total number of nodes across prefill and decode
+# as specified in their respective sections below (prefill.multinode.nodeCount + decode.multinode.nodeCount).
+# For autoscaling deployments, the compute domain will automatically adjust as needed.
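+# Here, the single-node prefill worker (no multinode section) contributes 1 node and
+# the decode worker contributes multinode.nodeCount = 8, so numNodes = 1 + 8 = 9.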
+apiVersion: resource.nvidia.com/v1beta1
+kind: ComputeDomain
+metadata:
+  name: trtllm-test-compute-domain
+spec:
+  numNodes: 9
+  channel:
+    resourceClaimTemplate:
+      name: trtllm-test-compute-domain-channel
+---
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: trtllm-disagg-multinode
+spec:
+  pvcs:
+    - name: model-cache-pvc
+      create: false
+  envs:
+    - name: NCCL_MNNVL_ENABLE
+      value: "1"
+    - name: NCCL_CUMEM_ENABLE
+      value: "1"
+    - name: TLLM_LOG_LEVEL
+      value: "info"
+    - name: TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER
+      value: "1"
+    - name: TRTLLM_ENABLE_PDL
+      value: "1"
+  backendFramework: trtllm
+  services:
+    Frontend:
+      dynamoNamespace: trtllm-disagg-multinode
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        tolerations: []
+        affinity: {}
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+          args:
+            - |
+              python3 -m dynamo.frontend --http-port 8000
+          command:
+            - /bin/sh
+            - -c
+    prefill:
+      dynamoNamespace: trtllm-disagg-multinode
+      componentType: worker
+      replicas: 1
+      # NOTE: Prefill uses 1 node (no multinode section = single node)
+      # and contributes to ComputeDomain.numNodes (see above)
+      volumeMounts:
+        - name: model-cache-pvc
+          mountPoint: /model-cache
+      sharedMemory:
+        size: 800Gi
+      resources:
+        requests:
+          cpu: "130"
+          memory: "850Gi"
+        limits:
+          cpu: "130"
+          memory: "850Gi"
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      extraPodSpec:
+        tolerations: []
+        affinity: {}
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+          workingDir: /workspace/components/backends/trtllm
+          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
+          # you might need to increase 'failureThreshold' below to allow more time for startup
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 500
+          volumeMounts:
+            - name: prefill-config-volume
+              mountPath: /config
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - >-
+              python3 -m dynamo.trtllm
+              --model-path /model-cache/deepseek-r1-fp4
+              --served-model-name deepseek-ai/DeepSeek-R1
+              --extra-engine-args /config/prefill_config.yaml
+              --disaggregation-mode prefill
+      resourceClaims:
+        - name: compute-domain-channel
+          resourceClaimTemplateName: trtllm-test-compute-domain-channel
+      volumes:
+        - name: prefill-config-volume
+          configMap:
+            name: prefill-config
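+    # The decode worker below spans 8 nodes x 4 GPUs = 32 GPUs, matching
+    # tensor_parallel_size: 32 / moe_expert_parallel_size: 32 in decode_config_dep32.yaml.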
+    decode:
+      dynamoNamespace: trtllm-disagg-multinode
+      componentType: worker
+      replicas: 1
+      volumeMounts:
+        - name: model-cache-pvc
+          mountPoint: /model-cache
+      multinode:
+        # NOTE: This nodeCount contributes to ComputeDomain.numNodes (see above)
+        nodeCount: 8
+      sharedMemory:
+        size: 800Gi
+      resources:
+        requests:
+          cpu: "130"
+          memory: "850Gi"
+        limits:
+          cpu: "130"
+          memory: "850Gi"
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      extraPodSpec:
+        tolerations: []
+        affinity: {}
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+          workingDir: /workspace/components/backends/trtllm
+          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
+          # you might need to increase 'failureThreshold' below to allow more time for startup
+          startupProbe:
+            httpGet:
+              path: /live
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 500
+          volumeMounts:
+            - name: decode-config-volume
+              mountPath: /config
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - >-
+              python3 -m dynamo.trtllm
+              --model-path /model-cache/deepseek-r1-fp4
+              --served-model-name deepseek-ai/DeepSeek-R1
+              --extra-engine-args /config/decode_config_dep32.yaml
+              --disaggregation-mode decode
+      resourceClaims:
+        - name: compute-domain-channel
+          resourceClaimTemplateName: trtllm-test-compute-domain-channel
+      volumes:
+        - name: decode-config-volume
+          configMap:
+            name: decode-config
diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
new file mode 100644
index 0000000000..13fe06b22a
--- /dev/null
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
@@ -0,0 +1,154 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: deepseek-r1-bench
+spec:
+  backoffLimit: 1
+  completions: 1
+  parallelism: 1
+  template:
+    metadata:
+      labels:
+        app: deepseek-r1-bench
+    spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: nvidia.com/dynamo-graph-deployment-name
+                    operator: In
+                    values:
+                      - trtllm-disagg-multinode
+              topologyKey: kubernetes.io/hostname
+      containers:
+        - command:
+            - /bin/sh
+            - -c
+            - |
+              apt-get update && apt-get install -y curl jq procps git && apt-get clean
+              pip install aiperf;
+              echo "aiperf installation completed";
+              sysctl -w net.ipv4.ip_local_port_range="1024 65000"
+              cat /proc/sys/net/ipv4/ip_local_port_range
+              export COLUMNS=200
+              EPOCH=$(date +%s)
+              ## utility functions -- can be moved to a bash script / configmap
+              wait_for_model_ready() {
+                echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
+                while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
+                  echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
+                  sleep 5
+                done
+                echo "✅ Model '$TARGET_MODEL' is now available!"
+                curl -s "http://$ENDPOINT/v1/models" | jq .
+              }
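+              # Usage: run_perf <concurrency> <isl> <osl>, e.g. run_perf 256 1024 1024
+              # (illustrative values); each sweep point writes its results under
+              # ${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/concurrency_<concurrency>.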
+              run_perf() {
+                local concurrency=$1
+                local isl=$2
+                local osl=$3
+                key=concurrency_${concurrency}
+                export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
+                mkdir -p "$ARTIFACT_DIR"
+                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+                aiperf profile --artifact-dir $ARTIFACT_DIR \
+                  --model $TARGET_MODEL \
+                  --tokenizer /model-cache/deepseek-r1-fp4 \
+                  --endpoint-type chat \
+                  --endpoint /v1/chat/completions \
+                  --streaming \
+                  --url http://$ENDPOINT \
+                  --synthetic-input-tokens-mean $isl \
+                  --synthetic-input-tokens-stddev 0 \
+                  --output-tokens-mean $osl \
+                  --output-tokens-stddev 0 \
+                  --extra-inputs "max_tokens:$osl" \
+                  --extra-inputs "min_tokens:$osl" \
+                  --extra-inputs "ignore_eos:true" \
+                  --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+                  --extra-inputs "repetition_penalty:1.0" \
+                  --extra-inputs "temperature:0.0" \
+                  --concurrency $concurrency \
+                  --request-count $((10*concurrency)) \
+                  --warmup-request-count $concurrency \
+                  --conversation-num 12800 \
+                  --random-seed 100 \
+                  --workers-max 252 \
+                  -H 'Authorization: Bearer NOT USED' \
+                  -H 'Accept: text/event-stream' \
+                  --record-processors 32 \
+                  --ui simple
+                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
+                ls -la $ARTIFACT_DIR
+              }
+              #### Actual execution ####
+              wait_for_model_ready
+              mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
+              # Calculate total concurrency based on per-GPU concurrency and GPU count
+              TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
+              echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
+              # Write input_config.json
+              cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <